Fix formatting for all files (#153)

2026-01-08 23:17:54 -05:00 · 2023-08-20 11:35:28 +03:00
parent e04bd928e6
commit b6c87c3fd8
98 changed files with 17906 additions and 19741 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -4,7 +4,7 @@ AlignConsecutiveMacros: true
 AlignTrailingComments: true
 AllowAllParametersOfDeclarationOnNextLine: true
 AllowShortBlocksOnASingleLine: true
-AllowShortCaseLabelsOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: false
 AllowShortFunctionsOnASingleLine: All
 AllowShortIfStatementsOnASingleLine: true
 AlwaysBreakTemplateDeclarations: true
@@ -23,9 +23,9 @@ DisableFormat: false
 IndentFunctionDeclarationAfterType: false
 IndentWidth: 2
 KeepEmptyLinesAtTheStartOfBlocks: false
-MaxEmptyLinesToKeep: 2
+MaxEmptyLinesToKeep: 1
 NamespaceIndentation: All
-PointerAlignment: Right
+PointerAlignment: Left
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeParens: ControlStatements
 SpaceInEmptyParentheses: false
--- a/.rustfmt.toml
+++ b/.rustfmt.toml
@@ -0,0 +1,10 @@
+# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md
+
+# Stable Configs
+chain_width = 0
+max_width = 120
+merge_derives = true
+use_field_init_shorthand = true
+use_try_shorthand = true
+
+# Unstable Configs
--- a/benches/msm.rs
+++ b/benches/msm.rs
@@ -34,16 +34,20 @@ fn bench_msm(c: &mut Criterion) {
            #[cfg(feature = "g2")]
            let mut d_g2_points = DeviceBuffer::from_slice(&g2_batch_points[..]).unwrap();

-            group.sample_size(30).bench_function(
-                &format!("MSM of size 2^{} in batch {}", log_msm_size, batch_size),
-                |b| b.iter(|| commit_batch_bls12_381(&mut d_points, &mut d_scalars, batch_size)),
-            );
+            group
+                .sample_size(30)
+                .bench_function(
+                    &format!("MSM of size 2^{} in batch {}", log_msm_size, batch_size),
+                    |b| b.iter(|| commit_batch_bls12_381(&mut d_points, &mut d_scalars, batch_size)),
+                );

            #[cfg(feature = "g2")]
-            group.sample_size(10).bench_function(
-                &format!("G2 MSM of size 2^{} in batch {}", log_msm_size, batch_size),
-                |b| b.iter(|| commit_batch_g2(&mut d_g2_points, &mut d_scalars, batch_size))
-            );
+            group
+                .sample_size(10)
+                .bench_function(
+                    &format!("G2 MSM of size 2^{} in batch {}", log_msm_size, batch_size),
+                    |b| b.iter(|| commit_batch_g2(&mut d_g2_points, &mut d_scalars, batch_size)),
+                );
        }
    }
 }
--- a/benches/ntt.rs
+++ b/benches/ntt.rs
@@ -21,46 +21,59 @@ fn bench_ntt(c: &mut Criterion) {

            let (_, mut d_evals, mut d_domain) = set_up_scalars_bls12_381(ntt_size * batch_size, log_ntt_size, true);

-            group.sample_size(scalar_samples).bench_function(
-                &format!("Scalar NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
-                |b| b.iter(|| evaluate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size))
-            );
-            
-            group.sample_size(scalar_samples).bench_function(
-                &format!("Scalar iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
-                |b| b.iter(|| interpolate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size))
-            );
+            group
+                .sample_size(scalar_samples)
+                .bench_function(
+                    &format!("Scalar NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
+                    |b| b.iter(|| evaluate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size)),
+                );

-            group.sample_size(scalar_samples).bench_function(
-                &format!("Scalar inplace NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
-                |b| b.iter(|| ntt_inplace_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size, false, 0))
-            );
-            
-            group.sample_size(scalar_samples).bench_function(
-                &format!("Scalar inplace iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
-                |b| b.iter(|| ntt_inplace_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size, true, 0))
-            );
+            group
+                .sample_size(scalar_samples)
+                .bench_function(
+                    &format!("Scalar iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
+                    |b| b.iter(|| interpolate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size)),
+                );
+
+            group
+                .sample_size(scalar_samples)
+                .bench_function(
+                    &format!("Scalar inplace NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
+                    |b| b.iter(|| ntt_inplace_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size, false, 0)),
+                );
+
+            group
+                .sample_size(scalar_samples)
+                .bench_function(
+                    &format!("Scalar inplace iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
+                    |b| b.iter(|| ntt_inplace_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size, true, 0)),
+                );

            drop(d_evals);
            drop(d_domain);

-            if ntt_size * batch_size > 1 << 18{
+            if ntt_size * batch_size > 1 << 18 {
                continue;
            }

            let point_samples = 10;

-            let (_, mut d_points_evals, mut d_domain) = set_up_points_bls12_381(ntt_size * batch_size, log_ntt_size, true);
-            
-            group.sample_size(point_samples).bench_function(
-                &format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
-                |b| b.iter(|| interpolate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size))
-            );
+            let (_, mut d_points_evals, mut d_domain) =
+                set_up_points_bls12_381(ntt_size * batch_size, log_ntt_size, true);

-            group.sample_size(point_samples).bench_function(
-                &format!("EC iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
-                |b| b.iter(|| evaluate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size))
-            );
+            group
+                .sample_size(point_samples)
+                .bench_function(
+                    &format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
+                    |b| b.iter(|| interpolate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size)),
+                );
+
+            group
+                .sample_size(point_samples)
+                .bench_function(
+                    &format!("EC iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
+                    |b| b.iter(|| evaluate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size)),
+                );

            drop(d_points_evals);
            drop(d_domain);
@@ -70,4 +83,3 @@ fn bench_ntt(c: &mut Criterion) {

 criterion_group!(ntt_benches, bench_ntt);
 criterion_main!(ntt_benches);
-
--- a/build.rs
+++ b/build.rs
@@ -26,8 +26,6 @@ fn main() {
    nvcc.debug(false);
    nvcc.flag(&arch);
    nvcc.flag(&stream);
-    nvcc.files([
-        "./icicle/curves/index.cu",
-    ]);
+    nvcc.files(["./icicle/curves/index.cu"]);
    nvcc.compile("ingo_icicle"); //TODO: extension??
 }
--- a/curve_parameters/new_curve_script.py
+++ b/curve_parameters/new_curve_script.py
@@ -204,14 +204,14 @@ newpath = f'./icicle/curves/{curve_name_lower}'
 if not os.path.exists(newpath):
    os.makedirs(newpath)

-with open("./icicle/curves/curve_template/params.cuh", "r") as params_file:
+with open("./icicle/curves/curve_template/params.cuh.tmpl", "r") as params_file:
    params_file_template = Template(params_file.read())
    params = get_params(config)
    params_content = params_file_template.safe_substitute(params)
    with open(f'./icicle/curves/{curve_name_lower}/params.cuh', 'w') as f:
        f.write(params_content)

-with open("./icicle/curves/curve_template/lde.cu", "r") as lde_file:
+with open("./icicle/curves/curve_template/lde.cu.tmpl", "r") as lde_file:
    template_content = Template(lde_file.read())
    lde_content = template_content.safe_substitute(
        CURVE_NAME_U=curve_name_upper, 
@@ -220,7 +220,7 @@ with open("./icicle/curves/curve_template/lde.cu", "r") as lde_file:
    with open(f'./icicle/curves/{curve_name_lower}/lde.cu', 'w') as f:
        f.write(lde_content)
    
-with open("./icicle/curves/curve_template/msm.cu", "r") as msm_file:
+with open("./icicle/curves/curve_template/msm.cu.tmpl", "r") as msm_file:
    template_content = Template(msm_file.read())
    msm_content = template_content.safe_substitute(
        CURVE_NAME_U=curve_name_upper, 
@@ -229,7 +229,7 @@ with open("./icicle/curves/curve_template/msm.cu", "r") as msm_file:
    with open(f'./icicle/curves/{curve_name_lower}/msm.cu', 'w') as f:
        f.write(msm_content)

-with open("./icicle/curves/curve_template/ve_mod_mult.cu", "r") as ve_mod_mult_file:
+with open("./icicle/curves/curve_template/ve_mod_mult.cu.tmpl", "r") as ve_mod_mult_file:
    template_content = Template(ve_mod_mult_file.read())
    ve_mod_mult_content = template_content.safe_substitute(
        CURVE_NAME_U=curve_name_upper, 
@@ -239,7 +239,7 @@ with open("./icicle/curves/curve_template/ve_mod_mult.cu", "r") as ve_mod_mult_f
        f.write(ve_mod_mult_content)
    

-with open(f'./icicle/curves/curve_template/curve_config.cuh', 'r') as cc:
+with open(f'./icicle/curves/curve_template/curve_config.cuh.tmpl', 'r') as cc:
    template_content = Template(cc.read())
    cc_content = template_content.safe_substitute(
        CURVE_NAME_U=curve_name_upper,
@@ -248,7 +248,7 @@ with open(f'./icicle/curves/curve_template/curve_config.cuh', 'r') as cc:
        f.write(cc_content)
    

-with open(f'./icicle/curves/curve_template/projective.cu', 'r') as proj:
+with open(f'./icicle/curves/curve_template/projective.cu.tmpl', 'r') as proj:
    template_content = Template(proj.read())
    proj_content = template_content.safe_substitute(
        CURVE_NAME_U=curve_name_upper, 
@@ -258,7 +258,7 @@ with open(f'./icicle/curves/curve_template/projective.cu', 'r') as proj:
        f.write(proj_content)


-with open(f'./icicle/curves/curve_template/supported_operations.cu', 'r') as supp_ops:
+with open(f'./icicle/curves/curve_template/supported_operations.cu.tmpl', 'r') as supp_ops:
    template_content = Template(supp_ops.read())
    supp_ops_content = template_content.safe_substitute()
    with open(f'./icicle/curves/{curve_name_lower}/supported_operations.cu', 'w') as f:
--- a/examples/ntt/main.rs
+++ b/examples/ntt/main.rs
@@ -1,9 +1,6 @@
 use std::time::Instant;

-use icicle_utils::{
-    curves::bls12_381::{Point_BLS12_381, ScalarField_BLS12_381},
-    test_bls12_381::*,
-};
+use icicle_utils::{curves::bls12_381::ScalarField_BLS12_381, test_bls12_381::*};
 use rustacuda::prelude::DeviceBuffer;

 const LOG_NTT_SIZES: [usize; 3] = [20, 10, 9];
@@ -22,13 +19,7 @@ fn bench_lde() {
                d_twiddles: &mut DeviceBuffer<ScalarField_BLS12_381>,
                batch_size: usize,
            ) -> i32 {
-                ntt_inplace_batch_bls12_381(
-                    d_inout,
-                    d_twiddles,
-                    batch_size,
-                    false,
-                    0,
-                );
+                ntt_inplace_batch_bls12_381(d_inout, d_twiddles, batch_size, false, 0);
                0
            }

@@ -37,13 +28,7 @@ fn bench_lde() {
                d_twiddles: &mut DeviceBuffer<ScalarField_BLS12_381>,
                batch_size: usize,
            ) -> i32 {
-                ntt_inplace_batch_bls12_381(
-                    d_inout,
-                    d_twiddles,
-                    batch_size,
-                    true,
-                    0,
-                );
+                ntt_inplace_batch_bls12_381(d_inout, d_twiddles, batch_size, true, 0);
                0
            }

@@ -129,16 +114,8 @@ fn bench_ntt_template<E, S, R>(
    ntt_size: usize,
    batch_size: usize,
    log_ntt_size: usize,
-    set_data: fn(
-        test_size: usize,
-        log_domain_size: usize,
-        inverse: bool,
-    ) -> (Vec<E>, DeviceBuffer<E>, DeviceBuffer<S>),
-    bench_fn: fn(
-        d_evaluations: &mut DeviceBuffer<E>,
-        d_domain: &mut DeviceBuffer<S>,
-        batch_size: usize,
-    ) -> R,
+    set_data: fn(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<E>, DeviceBuffer<E>, DeviceBuffer<S>),
+    bench_fn: fn(d_evaluations: &mut DeviceBuffer<E>, d_domain: &mut DeviceBuffer<S>, batch_size: usize) -> R,
    id: &str,
    inverse: bool,
    samples: usize,
@@ -159,7 +136,7 @@ fn bench_ntt_template<E, S, R>(
    let first = bench_fn(&mut d_evals, &mut d_domain, batch_size);

    let start = Instant::now();
-    for i in 0..samples {
+    for _ in 0..samples {
        bench_fn(&mut d_evals, &mut d_domain, batch_size);
    }
    let elapsed = start.elapsed();
--- a/goicicle/curves/bls12377/include/msm.h
+++ b/goicicle/curves/bls12377/include/msm.h
@@ -1,23 +1,23 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdbool.h>
 // msm.h

 #ifndef _BLS12_377_MSM_H
@@ -35,24 +35,61 @@ typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
 typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
 typedef cudaStream_t CudaStream_t;

-int msm_cuda_bls12_377(BLS12_377_projective_t* out, BLS12_377_affine_t* points,
-                   BLS12_377_scalar_t* scalars, size_t count, size_t device_id);
+int msm_cuda_bls12_377(
+  BLS12_377_projective_t* out, BLS12_377_affine_t* points, BLS12_377_scalar_t* scalars, size_t count, size_t device_id);

-int msm_batch_cuda_bls12_377(BLS12_377_projective_t* out, BLS12_377_affine_t* points,
-                         BLS12_377_scalar_t* scalars, size_t batch_size,
-                         size_t msm_size, size_t device_id);
+int msm_batch_cuda_bls12_377(
+  BLS12_377_projective_t* out,
+  BLS12_377_affine_t* points,
+  BLS12_377_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);

-int commit_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_scalar_t* d_scalars,
-                      BLS12_377_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
+int commit_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);

-int commit_batch_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_scalar_t* d_scalars,
-                            BLS12_377_affine_t* d_points, size_t count,
-                            size_t batch_size, size_t device_id);
+int commit_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);

-int msm_g2_cuda_bls12_377(BLS12_377_g2_projective_t *out, BLS12_377_g2_affine_t* points, BLS12_377_scalar_t* scalars, size_t count, size_t device_id);
-int msm_batch_g2_cuda_bls12_377(BLS12_377_g2_projective_t* out, BLS12_377_g2_affine_t* points, BLS12_377_scalar_t* scalars, size_t batch_size, size_t msm_size, size_t device_id);
-int commit_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_out, BLS12_377_scalar_t* d_scalars, BLS12_377_g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
-int commit_batch_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_out, BLS12_377_scalar_t* d_scalars, BLS12_377_g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id, cudaStream_t stream);
+int msm_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* out,
+  BLS12_377_g2_affine_t* points,
+  BLS12_377_scalar_t* scalars,
+  size_t count,
+  size_t device_id);
+int msm_batch_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* out,
+  BLS12_377_g2_affine_t* points,
+  BLS12_377_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+int commit_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+int commit_batch_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bls12377/include/ntt.h
+++ b/goicicle/curves/bls12377/include/ntt.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ntt.h

 #ifndef _BLS12_377_NTT_H
@@ -34,34 +34,145 @@ typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
 typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
 typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;

-int ntt_cuda_bls12_377(BLS12_377_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
-int ntt_batch_cuda_bls12_377(BLS12_377_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ntt_cuda_bls12_377(BLS12_377_scalar_t* arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
+int ntt_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-int ecntt_cuda_bls12_377(BLS12_377_projective_t *arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bls12_377(BLS12_377_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ecntt_cuda_bls12_377(BLS12_377_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
+int ecntt_batch_cuda_bls12_377(
+  BLS12_377_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-
-BLS12_377_scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t *d_evaluations, BLS12_377_scalar_t *d_domain, unsigned n, unsigned device_id, size_t stream);
-int interpolate_scalars_batch_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_evaluations, BLS12_377_scalar_t* d_domain, unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_points_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_projective_t *d_evaluations, BLS12_377_scalar_t *d_domain, unsigned n, size_t device_id, size_t stream);
-int interpolate_points_batch_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_projective_t* d_evaluations, BLS12_377_scalar_t* d_domain,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_scalars_on_coset_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_evaluations, BLS12_377_scalar_t* d_domain, unsigned n, BLS12_377_scalar_t* coset_powers, size_t device_id, size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_evaluations, BLS12_377_scalar_t* d_domain, unsigned n, unsigned batch_size, BLS12_377_scalar_t* coset_powers, size_t device_id, size_t stream);
-int evaluate_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t *d_coefficients, BLS12_377_scalar_t *d_domain, unsigned domain_size, unsigned n, unsigned device_id, size_t stream);
-int evaluate_scalars_batch_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_coefficients, BLS12_377_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_points_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_projective_t *d_coefficients, BLS12_377_scalar_t *d_domain, unsigned domain_size, unsigned n, size_t device_id, size_t stream);
-int evaluate_points_batch_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_projective_t* d_coefficients, BLS12_377_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_scalars_on_coset_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t *d_coefficients, BLS12_377_scalar_t *d_domain, unsigned domain_size,unsigned n, BLS12_377_scalar_t *coset_powers, unsigned device_id, size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_coefficients, BLS12_377_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BLS12_377_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_projective_t *d_coefficients, BLS12_377_scalar_t *d_domain, unsigned domain_size,unsigned n, BLS12_377_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_batch_cuda_bls12_377(BLS12_377_projective_t* d_out, BLS12_377_projective_t* d_coefficients, BLS12_377_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BLS12_377_scalar_t *coset_powers, size_t device_id, size_t stream);
+BLS12_377_scalar_t*
+build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
+int interpolate_scalars_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int interpolate_scalars_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_on_coset_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_batch_on_coset_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_377_scalar_t* coset_powers,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
 int reverse_order_scalars_cuda_bls12_377(BLS12_377_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int reverse_order_scalars_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
 int reverse_order_points_cuda_bls12_377(BLS12_377_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bls12_377(BLS12_377_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
+int reverse_order_points_batch_cuda_bls12_377(
+  BLS12_377_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int add_scalars_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
+int sub_scalars_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
 int to_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
 int from_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);

--- a/goicicle/curves/bls12377/include/projective.h
+++ b/goicicle/curves/bls12377/include/projective.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // projective.h

 #ifdef __cplusplus
@@ -24,25 +24,25 @@ extern "C" {
 #endif

 typedef struct BLS12_377_projective_t BLS12_377_projective_t;
-typedef struct  BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
+typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
 typedef struct BLS12_377_affine_t BLS12_377_affine_t;
 typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;

-bool projective_is_on_curve_bls12_377(BLS12_377_projective_t *point1);
+bool projective_is_on_curve_bls12_377(BLS12_377_projective_t* point1);

 BLS12_377_scalar_t* random_scalar_bls12_377();
 BLS12_377_projective_t* random_projective_bls12_377();
 BLS12_377_projective_t* projective_zero_bls12_377();
-BLS12_377_affine_t* projective_to_affine_bls12_377(BLS12_377_projective_t *point1);
-BLS12_377_projective_t* projective_from_affine_bls12_377(BLS12_377_affine_t *point1);
+BLS12_377_affine_t* projective_to_affine_bls12_377(BLS12_377_projective_t* point1);
+BLS12_377_projective_t* projective_from_affine_bls12_377(BLS12_377_affine_t* point1);

 BLS12_377_g2_projective_t* random_g2_projective_bls12_377();
-BLS12_377_affine_t* g2_projective_to_affine_bls12_377(BLS12_377_g2_projective_t *point1);
-BLS12_377_g2_projective_t* g2_projective_from_affine_bls12_377(BLS12_377_affine_t *point1);
-bool g2_projective_is_on_curve_bls12_377(BLS12_377_g2_projective_t *point1);
+BLS12_377_affine_t* g2_projective_to_affine_bls12_377(BLS12_377_g2_projective_t* point1);
+BLS12_377_g2_projective_t* g2_projective_from_affine_bls12_377(BLS12_377_affine_t* point1);
+bool g2_projective_is_on_curve_bls12_377(BLS12_377_g2_projective_t* point1);

-bool eq_bls12_377(BLS12_377_projective_t *point1, BLS12_377_projective_t *point2);
-bool eq_g2_bls12_377(BLS12_377_g2_projective_t *point1, BLS12_377_g2_projective_t *point2);
+bool eq_bls12_377(BLS12_377_projective_t* point1, BLS12_377_projective_t* point2);
+bool eq_g2_bls12_377(BLS12_377_g2_projective_t* point1, BLS12_377_g2_projective_t* point2);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bls12377/include/ve_mod_mult.h
+++ b/goicicle/curves/bls12377/include/ve_mod_mult.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ve_mod_mult.h

 #ifndef _BLS12_377_VEC_MULT_H
@@ -29,11 +29,18 @@ extern "C" {
 typedef struct BLS12_377_projective_t BLS12_377_projective_t;
 typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;

-int32_t vec_mod_mult_point_bls12_377(BLS12_377_projective_t *inout, BLS12_377_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_bls12_377(BLS12_377_scalar_t *inout, BLS12_377_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_bls12_377(BLS12_377_scalar_t *inout, BLS12_377_scalar_t *scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_bls12_377(BLS12_377_scalar_t *matrix_flattened, BLS12_377_scalar_t *input, BLS12_377_scalar_t *output, size_t n_elments, size_t device_id);
-
+int32_t vec_mod_mult_point_bls12_377(
+  BLS12_377_projective_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_scalar_bls12_377(
+  BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_device_scalar_bls12_377(
+  BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t matrix_vec_mod_mult_bls12_377(
+  BLS12_377_scalar_t* matrix_flattened,
+  BLS12_377_scalar_t* input,
+  BLS12_377_scalar_t* output,
+  size_t n_elments,
+  size_t device_id);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bls12381/include/msm.h
+++ b/goicicle/curves/bls12381/include/msm.h
@@ -1,23 +1,23 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdbool.h>
 // msm.h

 #ifndef _BLS12_381_MSM_H
@@ -35,24 +35,61 @@ typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
 typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
 typedef cudaStream_t CudaStream_t;

-int msm_cuda_bls12_381(BLS12_381_projective_t* out, BLS12_381_affine_t* points,
-                   BLS12_381_scalar_t* scalars, size_t count, size_t device_id);
+int msm_cuda_bls12_381(
+  BLS12_381_projective_t* out, BLS12_381_affine_t* points, BLS12_381_scalar_t* scalars, size_t count, size_t device_id);

-int msm_batch_cuda_bls12_381(BLS12_381_projective_t* out, BLS12_381_affine_t* points,
-                         BLS12_381_scalar_t* scalars, size_t batch_size,
-                         size_t msm_size, size_t device_id);
+int msm_batch_cuda_bls12_381(
+  BLS12_381_projective_t* out,
+  BLS12_381_affine_t* points,
+  BLS12_381_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);

-int commit_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_scalar_t* d_scalars,
-                      BLS12_381_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
+int commit_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_scalar_t* d_scalars,
+  BLS12_381_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);

-int commit_batch_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_scalar_t* d_scalars,
-                            BLS12_381_affine_t* d_points, size_t count,
-                            size_t batch_size, size_t device_id);
+int commit_batch_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_scalar_t* d_scalars,
+  BLS12_381_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);

-int msm_g2_cuda_bls12_381(BLS12_381_g2_projective_t *out, BLS12_381_g2_affine_t* points, BLS12_381_scalar_t* scalars, size_t count, size_t device_id);
-int msm_batch_g2_cuda_bls12_381(BLS12_381_g2_projective_t* out, BLS12_381_g2_affine_t* points, BLS12_381_scalar_t* scalars, size_t batch_size, size_t msm_size, size_t device_id);
-int commit_g2_cuda_bls12_381(BLS12_381_g2_projective_t* d_out, BLS12_381_scalar_t* d_scalars, BLS12_381_g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
-int commit_batch_g2_cuda_bls12_381(BLS12_381_g2_projective_t* d_out, BLS12_381_scalar_t* d_scalars, BLS12_381_g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id, cudaStream_t stream);
+int msm_g2_cuda_bls12_381(
+  BLS12_381_g2_projective_t* out,
+  BLS12_381_g2_affine_t* points,
+  BLS12_381_scalar_t* scalars,
+  size_t count,
+  size_t device_id);
+int msm_batch_g2_cuda_bls12_381(
+  BLS12_381_g2_projective_t* out,
+  BLS12_381_g2_affine_t* points,
+  BLS12_381_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+int commit_g2_cuda_bls12_381(
+  BLS12_381_g2_projective_t* d_out,
+  BLS12_381_scalar_t* d_scalars,
+  BLS12_381_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+int commit_batch_g2_cuda_bls12_381(
+  BLS12_381_g2_projective_t* d_out,
+  BLS12_381_scalar_t* d_scalars,
+  BLS12_381_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bls12381/include/ntt.h
+++ b/goicicle/curves/bls12381/include/ntt.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ntt.h

 #ifndef _BLS12_381_NTT_H
@@ -34,34 +34,145 @@ typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
 typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
 typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;

-int ntt_cuda_bls12_381(BLS12_381_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
-int ntt_batch_cuda_bls12_381(BLS12_381_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ntt_cuda_bls12_381(BLS12_381_scalar_t* arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
+int ntt_batch_cuda_bls12_381(
+  BLS12_381_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-int ecntt_cuda_bls12_381(BLS12_381_projective_t *arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bls12_381(BLS12_381_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ecntt_cuda_bls12_381(BLS12_381_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
+int ecntt_batch_cuda_bls12_381(
+  BLS12_381_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-
-BLS12_381_scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t *d_evaluations, BLS12_381_scalar_t *d_domain, unsigned n, unsigned device_id, size_t stream);
-int interpolate_scalars_batch_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_evaluations, BLS12_381_scalar_t* d_domain, unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_points_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_projective_t *d_evaluations, BLS12_381_scalar_t *d_domain, unsigned n, size_t device_id, size_t stream);
-int interpolate_points_batch_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_projective_t* d_evaluations, BLS12_381_scalar_t* d_domain,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_scalars_on_coset_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_evaluations, BLS12_381_scalar_t* d_domain, unsigned n, BLS12_381_scalar_t* coset_powers, size_t device_id, size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_evaluations, BLS12_381_scalar_t* d_domain, unsigned n, unsigned batch_size, BLS12_381_scalar_t* coset_powers, size_t device_id, size_t stream);
-int evaluate_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t *d_coefficients, BLS12_381_scalar_t *d_domain, unsigned domain_size, unsigned n, unsigned device_id, size_t stream);
-int evaluate_scalars_batch_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_coefficients, BLS12_381_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_points_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_projective_t *d_coefficients, BLS12_381_scalar_t *d_domain, unsigned domain_size, unsigned n, size_t device_id, size_t stream);
-int evaluate_points_batch_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_projective_t* d_coefficients, BLS12_381_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_scalars_on_coset_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t *d_coefficients, BLS12_381_scalar_t *d_domain, unsigned domain_size,unsigned n, BLS12_381_scalar_t *coset_powers, unsigned device_id, size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_coefficients, BLS12_381_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BLS12_381_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_projective_t *d_coefficients, BLS12_381_scalar_t *d_domain, unsigned domain_size,unsigned n, BLS12_381_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_batch_cuda_bls12_381(BLS12_381_projective_t* d_out, BLS12_381_projective_t* d_coefficients, BLS12_381_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BLS12_381_scalar_t *coset_powers, size_t device_id, size_t stream);
+BLS12_381_scalar_t*
+build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
+int interpolate_scalars_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_evaluations,
+  BLS12_381_scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int interpolate_scalars_batch_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_evaluations,
+  BLS12_381_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_projective_t* d_evaluations,
+  BLS12_381_scalar_t* d_domain,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_batch_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_projective_t* d_evaluations,
+  BLS12_381_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_on_coset_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_evaluations,
+  BLS12_381_scalar_t* d_domain,
+  unsigned n,
+  BLS12_381_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_batch_on_coset_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_evaluations,
+  BLS12_381_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_381_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_batch_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_projective_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_batch_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_projective_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_381_scalar_t* coset_powers,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_batch_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out,
+  BLS12_381_scalar_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_381_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_projective_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_381_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_batch_cuda_bls12_381(
+  BLS12_381_projective_t* d_out,
+  BLS12_381_projective_t* d_coefficients,
+  BLS12_381_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_381_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
 int reverse_order_scalars_cuda_bls12_381(BLS12_381_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int reverse_order_scalars_batch_cuda_bls12_381(
+  BLS12_381_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
 int reverse_order_points_cuda_bls12_381(BLS12_381_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bls12_381(BLS12_381_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
+int reverse_order_points_batch_cuda_bls12_381(
+  BLS12_381_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int add_scalars_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
+int sub_scalars_cuda_bls12_381(
+  BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
 int to_montgomery_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_inout, unsigned n, size_t stream);
 int from_montgomery_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_inout, unsigned n, size_t stream);

--- a/goicicle/curves/bls12381/include/projective.h
+++ b/goicicle/curves/bls12381/include/projective.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // projective.h

 #ifdef __cplusplus
@@ -24,25 +24,25 @@ extern "C" {
 #endif

 typedef struct BLS12_381_projective_t BLS12_381_projective_t;
-typedef struct  BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
+typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
 typedef struct BLS12_381_affine_t BLS12_381_affine_t;
 typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;

-bool projective_is_on_curve_bls12_381(BLS12_381_projective_t *point1);
+bool projective_is_on_curve_bls12_381(BLS12_381_projective_t* point1);

 BLS12_381_scalar_t* random_scalar_bls12_381();
 BLS12_381_projective_t* random_projective_bls12_381();
 BLS12_381_projective_t* projective_zero_bls12_381();
-BLS12_381_affine_t* projective_to_affine_bls12_381(BLS12_381_projective_t *point1);
-BLS12_381_projective_t* projective_from_affine_bls12_381(BLS12_381_affine_t *point1);
+BLS12_381_affine_t* projective_to_affine_bls12_381(BLS12_381_projective_t* point1);
+BLS12_381_projective_t* projective_from_affine_bls12_381(BLS12_381_affine_t* point1);

 BLS12_381_g2_projective_t* random_g2_projective_bls12_381();
-BLS12_381_affine_t* g2_projective_to_affine_bls12_381(BLS12_381_g2_projective_t *point1);
-BLS12_381_g2_projective_t* g2_projective_from_affine_bls12_381(BLS12_381_affine_t *point1);
-bool g2_projective_is_on_curve_bls12_381(BLS12_381_g2_projective_t *point1);
+BLS12_381_affine_t* g2_projective_to_affine_bls12_381(BLS12_381_g2_projective_t* point1);
+BLS12_381_g2_projective_t* g2_projective_from_affine_bls12_381(BLS12_381_affine_t* point1);
+bool g2_projective_is_on_curve_bls12_381(BLS12_381_g2_projective_t* point1);

-bool eq_bls12_381(BLS12_381_projective_t *point1, BLS12_381_projective_t *point2);
-bool eq_g2_bls12_381(BLS12_381_g2_projective_t *point1, BLS12_381_g2_projective_t *point2);
+bool eq_bls12_381(BLS12_381_projective_t* point1, BLS12_381_projective_t* point2);
+bool eq_g2_bls12_381(BLS12_381_g2_projective_t* point1, BLS12_381_g2_projective_t* point2);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bls12381/include/ve_mod_mult.h
+++ b/goicicle/curves/bls12381/include/ve_mod_mult.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ve_mod_mult.h

 #ifndef _BLS12_381_VEC_MULT_H
@@ -29,11 +29,18 @@ extern "C" {
 typedef struct BLS12_381_projective_t BLS12_381_projective_t;
 typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;

-int32_t vec_mod_mult_point_bls12_381(BLS12_381_projective_t *inout, BLS12_381_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_bls12_381(BLS12_381_scalar_t *inout, BLS12_381_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_bls12_381(BLS12_381_scalar_t *inout, BLS12_381_scalar_t *scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_bls12_381(BLS12_381_scalar_t *matrix_flattened, BLS12_381_scalar_t *input, BLS12_381_scalar_t *output, size_t n_elments, size_t device_id);
-
+int32_t vec_mod_mult_point_bls12_381(
+  BLS12_381_projective_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_scalar_bls12_381(
+  BLS12_381_scalar_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_device_scalar_bls12_381(
+  BLS12_381_scalar_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t matrix_vec_mod_mult_bls12_381(
+  BLS12_381_scalar_t* matrix_flattened,
+  BLS12_381_scalar_t* input,
+  BLS12_381_scalar_t* output,
+  size_t n_elments,
+  size_t device_id);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bn254/include/msm.h
+++ b/goicicle/curves/bn254/include/msm.h
@@ -1,23 +1,23 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdbool.h>
 // msm.h

 #ifndef _BN254_MSM_H
@@ -35,24 +35,57 @@ typedef struct BN254_g2_affine_t BN254_g2_affine_t;
 typedef struct BN254_scalar_t BN254_scalar_t;
 typedef cudaStream_t CudaStream_t;

-int msm_cuda_bn254(BN254_projective_t* out, BN254_affine_t* points,
-                   BN254_scalar_t* scalars, size_t count, size_t device_id);
+int msm_cuda_bn254(
+  BN254_projective_t* out, BN254_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);

-int msm_batch_cuda_bn254(BN254_projective_t* out, BN254_affine_t* points,
-                         BN254_scalar_t* scalars, size_t batch_size,
-                         size_t msm_size, size_t device_id);
+int msm_batch_cuda_bn254(
+  BN254_projective_t* out,
+  BN254_affine_t* points,
+  BN254_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);

-int commit_cuda_bn254(BN254_projective_t* d_out, BN254_scalar_t* d_scalars,
-                      BN254_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
+int commit_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);

-int commit_batch_cuda_bn254(BN254_projective_t* d_out, BN254_scalar_t* d_scalars,
-                            BN254_affine_t* d_points, size_t count,
-                            size_t batch_size, size_t device_id);
+int commit_batch_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);

-int msm_g2_cuda_bn254(BN254_g2_projective_t *out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
-int msm_batch_g2_cuda_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t batch_size, size_t msm_size, size_t device_id);
-int commit_g2_cuda_bn254(BN254_g2_projective_t* d_out, BN254_scalar_t* d_scalars, BN254_g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
-int commit_batch_g2_cuda_bn254(BN254_g2_projective_t* d_out, BN254_scalar_t* d_scalars, BN254_g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id, cudaStream_t stream);
+int msm_g2_cuda_bn254(
+  BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
+int msm_batch_g2_cuda_bn254(
+  BN254_g2_projective_t* out,
+  BN254_g2_affine_t* points,
+  BN254_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+int commit_g2_cuda_bn254(
+  BN254_g2_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+int commit_batch_g2_cuda_bn254(
+  BN254_g2_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bn254/include/ntt.h
+++ b/goicicle/curves/bn254/include/ntt.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ntt.h

 #ifndef _BN254_NTT_H
@@ -34,34 +34,143 @@ typedef struct BN254_scalar_t BN254_scalar_t;
 typedef struct BN254_g2_projective_t BN254_g2_projective_t;
 typedef struct BN254_g2_affine_t BN254_g2_affine_t;

-int ntt_cuda_bn254(BN254_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
-int ntt_batch_cuda_bn254(BN254_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ntt_cuda_bn254(BN254_scalar_t* arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
+int ntt_batch_cuda_bn254(BN254_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-int ecntt_cuda_bn254(BN254_projective_t *arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bn254(BN254_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ecntt_cuda_bn254(BN254_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
+int ecntt_batch_cuda_bn254(
+  BN254_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-
-BN254_scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_evaluations, BN254_scalar_t *d_domain, unsigned n, unsigned device_id, size_t stream);
-int interpolate_scalars_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_points_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_evaluations, BN254_scalar_t *d_domain, unsigned n, size_t device_id, size_t stream);
-int interpolate_points_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_evaluations, BN254_scalar_t* d_domain,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_scalars_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, BN254_scalar_t* coset_powers, size_t device_id, size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, unsigned batch_size, BN254_scalar_t* coset_powers, size_t device_id, size_t stream);
-int evaluate_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size, unsigned n, unsigned device_id, size_t stream);
-int evaluate_scalars_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_points_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size, unsigned n, size_t device_id, size_t stream);
-int evaluate_points_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_scalars_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size,unsigned n, BN254_scalar_t *coset_powers, unsigned device_id, size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BN254_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size,unsigned n, BN254_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BN254_scalar_t *coset_powers, size_t device_id, size_t stream);
+BN254_scalar_t*
+build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
+int interpolate_scalars_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_evaluations,
+  BN254_scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int interpolate_scalars_batch_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_evaluations,
+  BN254_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_projective_t* d_evaluations,
+  BN254_scalar_t* d_domain,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_batch_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_projective_t* d_evaluations,
+  BN254_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_on_coset_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_evaluations,
+  BN254_scalar_t* d_domain,
+  unsigned n,
+  BN254_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_batch_on_coset_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_evaluations,
+  BN254_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  BN254_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_batch_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_projective_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_batch_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_projective_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BN254_scalar_t* coset_powers,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_batch_cuda_bn254(
+  BN254_scalar_t* d_out,
+  BN254_scalar_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BN254_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_projective_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BN254_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_batch_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_projective_t* d_coefficients,
+  BN254_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BN254_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
 int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream);
 int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
 int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bn254(BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
+int reverse_order_points_batch_cuda_bn254(
+  BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int add_scalars_cuda_bn254(
+  BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
+int sub_scalars_cuda_bn254(
+  BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
 int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream);
 int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream);

--- a/goicicle/curves/bn254/include/projective.h
+++ b/goicicle/curves/bn254/include/projective.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // projective.h

 #ifdef __cplusplus
@@ -24,25 +24,25 @@ extern "C" {
 #endif

 typedef struct BN254_projective_t BN254_projective_t;
-typedef struct  BN254_g2_projective_t BN254_g2_projective_t;
+typedef struct BN254_g2_projective_t BN254_g2_projective_t;
 typedef struct BN254_affine_t BN254_affine_t;
 typedef struct BN254_scalar_t BN254_scalar_t;

-bool projective_is_on_curve_bn254(BN254_projective_t *point1);
+bool projective_is_on_curve_bn254(BN254_projective_t* point1);

 BN254_scalar_t* random_scalar_bn254();
 BN254_projective_t* random_projective_bn254();
 BN254_projective_t* projective_zero_bn254();
-BN254_affine_t* projective_to_affine_bn254(BN254_projective_t *point1);
-BN254_projective_t* projective_from_affine_bn254(BN254_affine_t *point1);
+BN254_affine_t* projective_to_affine_bn254(BN254_projective_t* point1);
+BN254_projective_t* projective_from_affine_bn254(BN254_affine_t* point1);

 BN254_g2_projective_t* random_g2_projective_bn254();
-BN254_affine_t* g2_projective_to_affine_bn254(BN254_g2_projective_t *point1);
-BN254_g2_projective_t* g2_projective_from_affine_bn254(BN254_affine_t *point1);
-bool g2_projective_is_on_curve_bn254(BN254_g2_projective_t *point1);
+BN254_affine_t* g2_projective_to_affine_bn254(BN254_g2_projective_t* point1);
+BN254_g2_projective_t* g2_projective_from_affine_bn254(BN254_affine_t* point1);
+bool g2_projective_is_on_curve_bn254(BN254_g2_projective_t* point1);

-bool eq_bn254(BN254_projective_t *point1, BN254_projective_t *point2);
-bool eq_g2_bn254(BN254_g2_projective_t *point1, BN254_g2_projective_t *point2);
+bool eq_bn254(BN254_projective_t* point1, BN254_projective_t* point2);
+bool eq_g2_bn254(BN254_g2_projective_t* point1, BN254_g2_projective_t* point2);

 #ifdef __cplusplus
 }
--- a/goicicle/curves/bn254/include/ve_mod_mult.h
+++ b/goicicle/curves/bn254/include/ve_mod_mult.h
@@ -1,22 +1,22 @@

-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Code generated by Ingonyama DO NOT EDIT

-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ve_mod_mult.h

 #ifndef _BN254_VEC_MULT_H
@@ -29,11 +29,14 @@ extern "C" {
 typedef struct BN254_projective_t BN254_projective_t;
 typedef struct BN254_scalar_t BN254_scalar_t;

-int32_t vec_mod_mult_point_bn254(BN254_projective_t *inout, BN254_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_bn254(BN254_scalar_t *inout, BN254_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_bn254(BN254_scalar_t *inout, BN254_scalar_t *scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_bn254(BN254_scalar_t *matrix_flattened, BN254_scalar_t *input, BN254_scalar_t *output, size_t n_elments, size_t device_id);
-
+int32_t
+vec_mod_mult_point_bn254(BN254_projective_t* inout, BN254_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t
+vec_mod_mult_scalar_bn254(BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_device_scalar_bn254(
+  BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t matrix_vec_mod_mult_bn254(
+  BN254_scalar_t* matrix_flattened, BN254_scalar_t* input, BN254_scalar_t* output, size_t n_elments, size_t device_id);

 #ifdef __cplusplus
 }
--- a/goicicle/templates/hfiles/msm.h.tmpl
+++ b/goicicle/templates/hfiles/msm.h.tmpl
@@ -1,6 +1,6 @@
-#include <stdbool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdbool.h>
 // msm.h

 #ifndef _{{.CurveNameUpperCase}}_MSM_H
@@ -18,24 +18,64 @@ typedef struct {{.CurveNameUpperCase}}_g2_affine_t {{.CurveNameUpperCase}}_g2_af
 typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;
 typedef cudaStream_t CudaStream_t;

-int msm_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* out, {{.CurveNameUpperCase}}_affine_t* points,
-                   {{.CurveNameUpperCase}}_scalar_t* scalars, size_t count, size_t device_id);
+int msm_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* out, {{.CurveNameUpperCase}}_affine_t* points, {{.CurveNameUpperCase}}_scalar_t* scalars, size_t count, size_t device_id);

-int msm_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* out, {{.CurveNameUpperCase}}_affine_t* points,
-                         {{.CurveNameUpperCase}}_scalar_t* scalars, size_t batch_size,
-                         size_t msm_size, size_t device_id);
+int msm_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* out,
+  {{.CurveNameUpperCase}}_affine_t* points,
+  {{.CurveNameUpperCase}}_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);

-int commit_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_scalars,
-                      {{.CurveNameUpperCase}}_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
+int commit_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
+  {{.CurveNameUpperCase}}_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);

-int commit_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_scalars,
-                            {{.CurveNameUpperCase}}_affine_t* d_points, size_t count,
-                            size_t batch_size, size_t device_id);
+int commit_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
+  {{.CurveNameUpperCase}}_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);

-int msm_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t *out, {{.CurveNameUpperCase}}_g2_affine_t* points, {{.CurveNameUpperCase}}_scalar_t* scalars, size_t count, size_t device_id);
-int msm_batch_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* out, {{.CurveNameUpperCase}}_g2_affine_t* points, {{.CurveNameUpperCase}}_scalar_t* scalars, size_t batch_size, size_t msm_size, size_t device_id);
-int commit_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_scalars, {{.CurveNameUpperCase}}_g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id);
-int commit_batch_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_scalars, {{.CurveNameUpperCase}}_g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id, cudaStream_t stream);
+int msm_g2_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_g2_projective_t* out,
+  {{.CurveNameUpperCase}}_g2_affine_t* points,
+  {{.CurveNameUpperCase}}_scalar_t* scalars,
+  size_t count,
+  size_t device_id);
+
+int msm_batch_g2_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_g2_projective_t* out,
+  {{.CurveNameUpperCase}}_g2_affine_t* points,
+  {{.CurveNameUpperCase}}_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+
+int commit_g2_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_g2_projective_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
+  {{.CurveNameUpperCase}}_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+
+int commit_batch_g2_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_g2_projective_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
+  {{.CurveNameUpperCase}}_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);

 #ifdef __cplusplus
 }
--- a/goicicle/templates/hfiles/ntt.h.tmpl
+++ b/goicicle/templates/hfiles/ntt.h.tmpl
@@ -1,5 +1,5 @@
-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // ntt.h

 #ifndef _{{.CurveNameUpperCase}}_NTT_H
@@ -17,34 +17,148 @@ typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t
 typedef struct {{.CurveNameUpperCase}}_g2_projective_t {{.CurveNameUpperCase}}_g2_projective_t;
 typedef struct {{.CurveNameUpperCase}}_g2_affine_t {{.CurveNameUpperCase}}_g2_affine_t;

-int ntt_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
-int ntt_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ntt_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* arr, uint32_t n, bool inverse, size_t decimation, size_t device_id);
+int ntt_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

-int ecntt_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t *arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+int ecntt_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
+int ecntt_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);

+{{.CurveNameUpperCase}}_scalar_t* 
+build_domain_cuda_{{.CurveNameLowerCase}}(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
+
+int interpolate_scalars_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int interpolate_scalars_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_projective_t* d_evaluations,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_projective_t* d_evaluations,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned n,
+  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_batch_on_coset_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+
+int evaluate_scalars_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
+  unsigned device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out,
+  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* d_out,
+  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
+  {{.CurveNameUpperCase}}_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);

-{{.CurveNameUpperCase}}_scalar_t* build_domain_cuda_{{.CurveNameLowerCase}}(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t *d_evaluations, {{.CurveNameUpperCase}}_scalar_t *d_domain, unsigned n, unsigned device_id, size_t stream);
-int interpolate_scalars_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_evaluations, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_projective_t *d_evaluations, {{.CurveNameUpperCase}}_scalar_t *d_domain, unsigned n, size_t device_id, size_t stream);
-int interpolate_points_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_projective_t* d_evaluations, {{.CurveNameUpperCase}}_scalar_t* d_domain,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int interpolate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_evaluations, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned n, {{.CurveNameUpperCase}}_scalar_t* coset_powers, size_t device_id, size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_evaluations, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned n, unsigned batch_size, {{.CurveNameUpperCase}}_scalar_t* coset_powers, size_t device_id, size_t stream);
-int evaluate_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t *d_coefficients, {{.CurveNameUpperCase}}_scalar_t *d_domain, unsigned domain_size, unsigned n, unsigned device_id, size_t stream);
-int evaluate_scalars_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_coefficients, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_projective_t *d_coefficients, {{.CurveNameUpperCase}}_scalar_t *d_domain, unsigned domain_size, unsigned n, size_t device_id, size_t stream);
-int evaluate_points_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_projective_t* d_coefficients, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream);
-int evaluate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t *d_coefficients, {{.CurveNameUpperCase}}_scalar_t *d_domain, unsigned domain_size,unsigned n, {{.CurveNameUpperCase}}_scalar_t *coset_powers, unsigned device_id, size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_coefficients, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, {{.CurveNameUpperCase}}_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_projective_t *d_coefficients, {{.CurveNameUpperCase}}_scalar_t *d_domain, unsigned domain_size,unsigned n, {{.CurveNameUpperCase}}_scalar_t *coset_powers, size_t device_id, size_t stream);
-int evaluate_points_on_coset_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_out, {{.CurveNameUpperCase}}_projective_t* d_coefficients, {{.CurveNameUpperCase}}_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, {{.CurveNameUpperCase}}_scalar_t *coset_powers, size_t device_id, size_t stream);
 int reverse_order_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int reverse_order_scalars_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
 int reverse_order_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_in1, {{.CurveNameUpperCase}}_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_in1, {{.CurveNameUpperCase}}_scalar_t* d_in2, unsigned n, size_t stream);
+int reverse_order_points_batch_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int add_scalars_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_in1, {{.CurveNameUpperCase}}_scalar_t* d_in2, unsigned n, size_t stream);
+int sub_scalars_cuda_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_in1, {{.CurveNameUpperCase}}_scalar_t* d_in2, unsigned n, size_t stream);
 int to_montgomery_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_inout, unsigned n, size_t stream);
 int from_montgomery_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_inout, unsigned n, size_t stream);

--- a/goicicle/templates/hfiles/projective.h.tmpl
+++ b/goicicle/templates/hfiles/projective.h.tmpl
@@ -1,5 +1,5 @@
-#include <stdbool.h>
 #include <cuda.h>
+#include <stdbool.h>
 // projective.h

 #ifdef __cplusplus
@@ -7,25 +7,25 @@ extern "C" {
 #endif

 typedef struct {{.CurveNameUpperCase}}_projective_t {{.CurveNameUpperCase}}_projective_t;
-typedef struct  {{.CurveNameUpperCase}}_g2_projective_t {{.CurveNameUpperCase}}_g2_projective_t;
+typedef struct {{.CurveNameUpperCase}}_g2_projective_t {{.CurveNameUpperCase}}_g2_projective_t;
 typedef struct {{.CurveNameUpperCase}}_affine_t {{.CurveNameUpperCase}}_affine_t;
 typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;

-bool projective_is_on_curve_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t *point1);
+bool projective_is_on_curve_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* point1);

 {{.CurveNameUpperCase}}_scalar_t* random_scalar_{{.CurveNameLowerCase}}();
 {{.CurveNameUpperCase}}_projective_t* random_projective_{{.CurveNameLowerCase}}();
 {{.CurveNameUpperCase}}_projective_t* projective_zero_{{.CurveNameLowerCase}}();
-{{.CurveNameUpperCase}}_affine_t* projective_to_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t *point1);
-{{.CurveNameUpperCase}}_projective_t* projective_from_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t *point1);
+{{.CurveNameUpperCase}}_affine_t* projective_to_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* point1);
+{{.CurveNameUpperCase}}_projective_t* projective_from_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t* point1);

 {{.CurveNameUpperCase}}_g2_projective_t* random_g2_projective_{{.CurveNameLowerCase}}();
-{{.CurveNameUpperCase}}_affine_t* g2_projective_to_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t *point1);
-{{.CurveNameUpperCase}}_g2_projective_t* g2_projective_from_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t *point1);
-bool g2_projective_is_on_curve_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t *point1);
+{{.CurveNameUpperCase}}_affine_t* g2_projective_to_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* point1);
+{{.CurveNameUpperCase}}_g2_projective_t* g2_projective_from_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t* point1);
+bool g2_projective_is_on_curve_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* point1);

-bool eq_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t *point1, {{.CurveNameUpperCase}}_projective_t *point2);
-bool eq_g2_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t *point1, {{.CurveNameUpperCase}}_g2_projective_t *point2);
+bool eq_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* point1, {{.CurveNameUpperCase}}_projective_t* point2);
+bool eq_g2_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* point1, {{.CurveNameUpperCase}}_g2_projective_t* point2);

 #ifdef __cplusplus
 }
--- a/goicicle/templates/hfiles/ve_mod_mult.h.tmpl
+++ b/goicicle/templates/hfiles/ve_mod_mult.h.tmpl
@@ -12,11 +12,18 @@ extern "C" {
 typedef struct {{.CurveNameUpperCase}}_projective_t {{.CurveNameUpperCase}}_projective_t;
 typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;

-int32_t vec_mod_mult_point_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t *inout, {{.CurveNameUpperCase}}_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t *inout, {{.CurveNameUpperCase}}_scalar_t *scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t *inout, {{.CurveNameUpperCase}}_scalar_t *scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t *matrix_flattened, {{.CurveNameUpperCase}}_scalar_t *input, {{.CurveNameUpperCase}}_scalar_t *output, size_t n_elments, size_t device_id);
-
+int32_t vec_mod_mult_point_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_projective_t* inout, {{.CurveNameUpperCase}}_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_scalar_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* inout, {{.CurveNameUpperCase}}_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
+int32_t vec_mod_mult_device_scalar_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* inout, {{.CurveNameUpperCase}}_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
+int32_t matrix_vec_mod_mult_{{.CurveNameLowerCase}}(
+  {{.CurveNameUpperCase}}_scalar_t* matrix_flattened,
+  {{.CurveNameUpperCase}}_scalar_t* input,
+  {{.CurveNameUpperCase}}_scalar_t* output,
+  size_t n_elments,
+  size_t device_id);

 #ifdef __cplusplus
 }
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -3,19 +3,46 @@
 #pragma once

 template <typename S, typename P, typename A>
-void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream);
+void bucket_method_msm(
+  unsigned bitsize,
+  unsigned c,
+  S* scalars,
+  A* points,
+  unsigned size,
+  P* final_result,
+  bool on_device,
+  bool big_triangle,
+  cudaStream_t stream);

 template <typename S, typename P, typename A>
-void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream);
+void batched_bucket_method_msm(
+  unsigned bitsize,
+  unsigned c,
+  S* scalars,
+  A* points,
+  unsigned batch_size,
+  unsigned msm_size,
+  P* final_results,
+  bool on_device,
+  cudaStream_t stream);

 template <typename S, typename P, typename A>
-void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);
+void batched_large_msm(
+  S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);

 template <typename S, typename P, typename A>
-void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, unsigned large_bucket_factor, cudaStream_t stream);
+void large_msm(
+  S* scalars,
+  A* points,
+  unsigned size,
+  P* result,
+  bool on_device,
+  bool big_triangle,
+  unsigned large_bucket_factor,
+  cudaStream_t stream);

 template <typename S, typename P, typename A>
-void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream);
+void short_msm(S* h_scalars, A* h_points, unsigned size, P* h_final_result, cudaStream_t stream);

 template <typename A, typename S, typename P>
 void reference_msm(S* scalars, A* a_points, unsigned size);
--- a/icicle/appUtils/msm/tests/msm_test.cu
+++ b/icicle/appUtils/msm/tests/msm_test.cu
@@ -1,131 +1,115 @@
-#include <iostream>
-#include <chrono>
-#include <vector>
-#include "msm.cu"
-#include "../../utils/cuda_utils.cuh"
-#include "../../primitives/projective.cuh"
 #include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/cuda_utils.cuh"
+#include "msm.cu"
+#include <chrono>
+#include <iostream>
+#include <vector>
 // #include "../../curves/bls12_377/curve_config.cuh"
 #include "../../curves/bn254/curve_config.cuh"

 // using namespace BLS12_377;
 using namespace BN254;

-class Dummy_Scalar {
-  public:
-    static constexpr unsigned NBITS = 32;
+class Dummy_Scalar
+{
+public:
+  static constexpr unsigned NBITS = 32;

-    unsigned x;
-    unsigned p = 10;
-    // unsigned p = 1<<30;
+  unsigned x;
+  unsigned p = 10;
+  // unsigned p = 1<<30;

-    static HOST_DEVICE_INLINE Dummy_Scalar zero() {
-      return {0};
-    }
+  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

-    static HOST_DEVICE_INLINE Dummy_Scalar one() {
-      return {1};
-    }
+  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

-    friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) {
-      os << scalar.x;
-      return os;
-    }
+  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
+  {
+    os << scalar.x;
+    return os;
+  }

-    HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) {
-      return (x>>(digit_num*digit_width))&((1<<digit_width)-1);
-    }
+  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width)
+  {
+    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
+  }

-    friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) {   
-      return {(p1.x+p2.x)%p1.p};
-    }
+  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
+  {
+    return {(p1.x + p2.x) % p1.p};
+  }

-    friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) {
-      return (p1.x == p2.x);
-    }
+  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

-    friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) {
-      return (p1.x == p2);
-    }
+  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

-    static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { 
-      return {scalar.p-scalar.x};
-    }
-    static HOST_INLINE Dummy_Scalar rand_host() {
-      return {(unsigned)rand()%10};
-      // return {(unsigned)rand()};
-    }
+  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
+  static HOST_INLINE Dummy_Scalar rand_host()
+  {
+    return {(unsigned)rand() % 10};
+    // return {(unsigned)rand()};
+  }
 };

-class Dummy_Projective {
+class Dummy_Projective
+{
+public:
+  Dummy_Scalar x;

-  public:
-    Dummy_Scalar x;
+  static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; }

-    static HOST_DEVICE_INLINE Dummy_Projective zero() {
-      return {0};
+  static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; }
+
+  static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
+
+  static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
+
+  static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
+
+  friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
+  {
+    return {p1.x + p2.x};
+  }
+
+  // friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
+  //   return p1 + neg(p2);
+  // }
+
+  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
+  {
+    os << point.x;
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
+  {
+    Dummy_Projective res = zero();
+#ifdef CUDA_ARCH
+#pragma unroll
+#endif
+    for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
+      if (i > 0) { res = res + res; }
+      if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; }
    }
+    return res;
+  }

-    static HOST_DEVICE_INLINE Dummy_Projective one() {
-      return {1};
-    }
+  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
+  {
+    return (p1.x == p2.x);
+  }

-    static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective &point) {
-      return {point.x};
-    }
+  static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; }

-    static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective &point) {
-      return {point.x};
-    }
-
-    static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { 
-      return {Dummy_Scalar::neg(point.x)};
-    }
-
-    friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) {   
-      return {p1.x+p2.x};
-    }
-
-    // friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {   
-    //   return p1 + neg(p2);
-    // }
-
-    friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point) {
-      os << point.x;
-      return os;
-    }
-
-    friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point) {   
-      Dummy_Projective res = zero();
-  #ifdef CUDA_ARCH
-  #pragma unroll
-  #endif
-      for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
-        if (i > 0) {
-          res = res + res;
-        }
-        if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) {
-          res = res + point;
-        }
-      }
-      return res;
-    }
-
-    friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) {
-      return (p1.x == p2.x);
-    }
-
-    static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective &point) {
-      return point.x == 0;
-    }
-
-    static HOST_INLINE Dummy_Projective rand_host() {
-      return {(unsigned)rand()%10};
-      // return {(unsigned)rand()};
-    }
+  static HOST_INLINE Dummy_Projective rand_host()
+  {
+    return {(unsigned)rand() % 10};
+    // return {(unsigned)rand()};
+  }
 };

-//switch between dummy and real:
+// switch between dummy and real:

 typedef scalar_t test_scalar;
 typedef projective_t test_projective;
@@ -138,62 +122,62 @@ typedef affine_t test_affine;
 int main()
 {
  unsigned batch_size = 1;
-//   unsigned msm_size = 1<<21;
+  //   unsigned msm_size = 1<<21;
  unsigned msm_size = 12180757;
-  unsigned N = batch_size*msm_size;
+  unsigned N = batch_size * msm_size;

-  test_scalar *scalars = new test_scalar[N];
-  test_affine *points = new test_affine[N];
-  
-  for (unsigned i=0;i<N;i++){
+  test_scalar* scalars = new test_scalar[N];
+  test_affine* points = new test_affine[N];
+
+  for (unsigned i = 0; i < N; i++) {
    // scalars[i] = (i%msm_size < 10)? test_scalar::rand_host() : scalars[i-10];
-    points[i] = (i%msm_size < 10)? test_projective::to_affine(test_projective::rand_host()): points[i-10];
+    points[i] = (i % msm_size < 10) ? test_projective::to_affine(test_projective::rand_host()) : points[i - 10];
    scalars[i] = test_scalar::rand_host();
    // scalars[i] = i < N/2? test_scalar::rand_host() : test_scalar::one();
    // points[i] = test_projective::to_affine(test_projective::rand_host());
  }
-  std::cout<<"finished generating"<<std::endl;
+  std::cout << "finished generating" << std::endl;

  // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
  // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
-  test_projective large_res[batch_size*2];
+  test_projective large_res[batch_size * 2];
  // test_projective batched_large_res[batch_size];
  // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
  // fake_point batched_large_res[256];

-
  // short_msm<scalar_t, projective_t, affine_t>(scalars, points, N, short_res);
  // for (unsigned i=0;i<batch_size;i++){
-    // large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false);
-    // std::cout<<"final result large"<<std::endl;
-    // std::cout<<test_projective::to_affine(*large_res)<<std::endl;
+  // large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i,
+  // false); std::cout<<"final result large"<<std::endl; std::cout<<test_projective::to_affine(*large_res)<<std::endl;
  // }

-  test_scalar *scalars_d;
-  test_affine *points_d;
-  test_projective *large_res_d;
+  test_scalar* scalars_d;
+  test_affine* points_d;
+  test_projective* large_res_d;

  cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
  cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
  cudaMalloc(&large_res_d, sizeof(test_projective));
  cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
  cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
-  
-  std::cout<<"finished copying"<<std::endl;

-  // batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size, batched_large_res, false);
+  std::cout << "finished copying" << std::endl;
+
+  // batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size,
+  // batched_large_res, false);
  cudaStream_t stream1;
  cudaStream_t stream2;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);
  auto begin1 = std::chrono::high_resolution_clock::now();
-  large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res, false, true,stream1);
+  large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res, false, true, stream1);
  auto end1 = std::chrono::high_resolution_clock::now();
  auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
  printf("Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
  // std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
  auto begin = std::chrono::high_resolution_clock::now();
-  large_msm<test_scalar, test_projective, test_affine>(scalars_d, points_d, msm_size, large_res_d, true, false,stream2);
+  large_msm<test_scalar, test_projective, test_affine>(
+    scalars_d, points_d, msm_size, large_res_d, true, false, stream2);
  // test_reduce_triangle(scalars);
  // test_reduce_rectangle(scalars);
  // test_reduce_single(scalars);
@@ -201,17 +185,17 @@ int main()
  auto end = std::chrono::high_resolution_clock::now();
  auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  printf("On Device No Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
-    cudaStreamSynchronize(stream1);
-    cudaStreamSynchronize(stream2);
-    cudaStreamDestroy(stream1);
-    cudaStreamDestroy(stream2);
+  cudaStreamSynchronize(stream1);
+  cudaStreamSynchronize(stream2);
+  cudaStreamDestroy(stream1);
+  cudaStreamDestroy(stream2);

-  std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
+  std::cout << test_projective::to_affine(large_res[0]) << std::endl;

  cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
-  std::cout<<test_projective::to_affine(large_res[1])<<std::endl;
+  std::cout << test_projective::to_affine(large_res[1]) << std::endl;

-//   reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
+  //   reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);

  // std::cout<<"final results batched large"<<std::endl;
  // bool success = true;
@@ -230,7 +214,7 @@ int main()
  // if (success){
  //   std::cout<<"success!"<<std::endl;
  // }
-  
+
  // std::cout<<batched_large_res[0]<<std::endl;
  // std::cout<<batched_large_res[1]<<std::endl;
  // std::cout<<projective_t::to_affine(batched_large_res[0])<<std::endl;
--- a/icicle/appUtils/ntt/lde.cu
+++ b/icicle/appUtils/ntt/lde.cu
@@ -1,47 +1,60 @@
 #ifndef LDE
 #define LDE
-#include <cuda.h>
-#include "ntt.cuh"
-#include "lde.cuh"
 #include "../vector_manipulation/ve_mod_mult.cuh"
+#include "lde.cuh"
+#include "ntt.cuh"
+#include <cuda.h>
+
+template <typename E, bool SUB>
+__global__ void add_sub_array(E* res, E* in1, E* in2, uint32_t n)
+{
+  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (tid < n) { res[tid] = SUB ? in1[tid] - in2[tid] : in1[tid] + in2[tid]; }
+}
+
+template <typename E>
+int sub_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream)
+{
+  uint32_t NUM_THREADS = MAX_THREADS_BATCH;
+  uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
+
+  add_sub_array<E, true><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, d_in1, d_in2, n);
+
+  return 0;
+}
+
+template <typename E>
+int add_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream)
+{
+  uint32_t NUM_THREADS = MAX_THREADS_BATCH;
+  uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
+
+  add_sub_array<E, false><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, d_in1, d_in2, n);
+
+  return 0;
+}

-template < typename E, bool SUB > __global__ void add_sub_array(E* res, E* in1, E* in2, uint32_t n) {
-    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (tid < n) {
-      res[tid] = SUB ? in1[tid] - in2[tid] : in1[tid] + in2[tid];
-    }
-  }
-  
-  template <typename E>
-  int sub_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream) {
-    uint32_t NUM_THREADS = MAX_THREADS_BATCH;
-    uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
-  
-    add_sub_array <E, true> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, d_in1, d_in2, n);
-  
-    return 0;
-  }
-  
-  template <typename E>
-  int add_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream) {
-    uint32_t NUM_THREADS = MAX_THREADS_BATCH;
-    uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
-  
-    add_sub_array <E, false> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, d_in1, d_in2, n);
-  
-    return 0;
-  }
-  
 /**
 * Interpolate a batch of polynomials from their evaluations on the same subgroup.
 * Note: this function does not preform any bit-reverse permutations on its inputs or outputs.
- * @param d_out The variable to write coefficients of the resulting polynomials into (the coefficients are in bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
+ * @param d_out The variable to write coefficients of the resulting polynomials into (the coefficients are in
+ * bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
 * @param d_evaluations Input array of evaluations of all polynomials of type E (elements).
 * @param d_domain Domain on which the polynomials are evaluated. Must be a subgroup.
 * @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial.
 * @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`.
 */
-template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) {
+template <typename E, typename S>
+int interpolate_batch(
+  E* d_out,
+  E* d_evaluations,
+  S* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  bool coset,
+  S* coset_powers,
+  cudaStream_t stream)
+{
  cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream);
  ntt_inplace_batch_template(d_out, d_domain, n, batch_size, true, coset, coset_powers, stream, true);
  return 0;
@@ -50,47 +63,63 @@ template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluat
 /**
 * Interpolate a polynomial from its evaluations on a subgroup.
 * Note: this function does not preform any bit-reverse permutations on its inputs or outputs.
- * @param d_out The variable to write coefficients of the resulting polynomial into (the coefficients are in bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
+ * @param d_out The variable to write coefficients of the resulting polynomial into (the coefficients are in
+ * bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
 * @param d_evaluations Input array of evaluations that have type E (elements).
 * @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup.
 * @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length).
 */
-template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) {
-  return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1, coset, coset_powers, stream);
+template <typename E, typename S>
+int interpolate(E* d_out, E* d_evaluations, S* d_domain, unsigned n, bool coset, S* coset_powers, cudaStream_t stream)
+{
+  return interpolate_batch<E, S>(d_out, d_evaluations, d_domain, n, 1, coset, coset_powers, stream);
 }

-template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
+template <typename E>
+__global__ void fill_array(E* arr, E val, uint32_t n)
+{
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (tid < n) {
-    arr[tid] = val;
-  }
+  if (tid < n) { arr[tid] = val; }
 }

 /**
 * Evaluate a batch of polynomials on the same coset.
 * @param d_out The evaluations of the polynomials on coset `u` * `d_domain`.
- * @param d_coefficients Input array of coefficients of all polynomials of type E (elements) to be evaluated in-place on a coset.
+ * @param d_coefficients Input array of coefficients of all polynomials of type E (elements) to be evaluated in-place on
+ * a coset.
 * @param d_domain Domain on which the polynomials are evaluated (see `coset` flag). Must be a subgroup.
 * @param domain_size Length of `d_domain` array, on which the polynomial is computed.
 * @param n The number of coefficients, which might be different from `domain_size`.
 * @param batch_size The size of the batch; the length of `d_coefficients` is `n` * `batch_size`.
 * @param coset The flag that indicates whether to evaluate on a coset. If false, evaluate on a subgroup `d_domain`.
- * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
+ * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of
+ * the coset.
 */
 template <typename E, typename S>
-int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) {
+int evaluate_batch(
+  E* d_out,
+  E* d_coefficients,
+  S* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  bool coset,
+  S* coset_powers,
+  cudaStream_t stream)
+{
  uint32_t logn = uint32_t(log(domain_size) / log(2));
  if (domain_size > n) {
    // allocate and initialize an array of stream handles to parallelize data copying across batches
-    cudaStream_t *memcpy_streams = (cudaStream_t *) malloc(batch_size * sizeof(cudaStream_t));
-    for (unsigned i = 0; i < batch_size; i++)
-    {
+    cudaStream_t* memcpy_streams = (cudaStream_t*)malloc(batch_size * sizeof(cudaStream_t));
+    for (unsigned i = 0; i < batch_size; i++) {
      cudaStreamCreate(&(memcpy_streams[i]));

-      cudaMemcpyAsync(&d_out[i * domain_size], &d_coefficients[i * n], n * sizeof(E), cudaMemcpyDeviceToDevice, memcpy_streams[i]);
+      cudaMemcpyAsync(
+        &d_out[i * domain_size], &d_coefficients[i * n], n * sizeof(E), cudaMemcpyDeviceToDevice, memcpy_streams[i]);
      uint32_t NUM_THREADS = MAX_THREADS_BATCH;
      uint32_t NUM_BLOCKS = (domain_size - n + NUM_THREADS - 1) / NUM_THREADS;
-      fill_array <E> <<<NUM_BLOCKS, NUM_THREADS, 0, memcpy_streams[i]>>> (&d_out[i * domain_size + n], E::zero(), domain_size - n);
+      fill_array<E>
+        <<<NUM_BLOCKS, NUM_THREADS, 0, memcpy_streams[i]>>>(&d_out[i * domain_size + n], E::zero(), domain_size - n);

      cudaStreamSynchronize(memcpy_streams[i]);
      cudaStreamDestroy(memcpy_streams[i]);
@@ -98,9 +127,8 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
  } else
    cudaMemcpyAsync(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice, stream);

-  if (coset)
-    batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream);
-  
+  if (coset) batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream);
+
  S* _null = nullptr;
  ntt_inplace_batch_template(d_out, d_domain, domain_size, batch_size, false, false, _null, stream, true);
  return 0;
@@ -108,102 +136,144 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_

 /**
 * Evaluate a polynomial on a coset.
- * Note: this function does not preform any bit-reverse permutations on its inputs or outputs, so the order of outputs is bit-reversed.
+ * Note: this function does not preform any bit-reverse permutations on its inputs or outputs, so the order of outputs
+ * is bit-reversed.
 * @param d_out The evaluations of the polynomial on coset `u` * `d_domain`.
 * @param d_coefficients Input array of coefficients of a polynomial of type E (elements).
 * @param d_domain Domain on which the polynomial is evaluated (see `coset` flag). Must be a subgroup.
 * @param domain_size Length of `d_domain` array, on which the polynomial is computed.
 * @param n The number of coefficients, which might be different from `domain_size`.
 * @param coset The flag that indicates whether to evaluate on a coset. If false, evaluate on a subgroup `d_domain`.
- * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
+ * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of
+ * the coset.
 */
-template <typename E, typename S> 
-int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) {
-  return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers, stream);
+template <typename E, typename S>
+int evaluate(
+  E* d_out,
+  E* d_coefficients,
+  S* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  bool coset,
+  S* coset_powers,
+  cudaStream_t stream)
+{
+  return evaluate_batch<E, S>(d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers, stream);
 }

-template <typename S> 
-int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
+template <typename S>
+int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream)
+{
  S* _null = nullptr;
  return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
 }

-template <typename S> 
-int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
+template <typename S>
+int interpolate_scalars_batch(
+  S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream)
+{
  S* _null = nullptr;
  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
 }

-template <typename E, typename S> 
-int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
+template <typename E, typename S>
+int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream)
+{
  S* _null = nullptr;
  return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
 }

-template <typename E, typename S> 
-int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
+template <typename E, typename S>
+int interpolate_points_batch(
+  E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream)
+{
  S* _null = nullptr;
  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
 }

-template <typename S> 
-int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
+template <typename S>
+int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream)
+{
  S* _null = nullptr;
  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
 }

-template <typename S> 
-int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
+template <typename S>
+int evaluate_scalars_batch(
+  S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream)
+{
  S* _null = nullptr;
  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
 }

-template <typename E, typename S> 
-int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
+template <typename E, typename S>
+int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream)
+{
  S* _null = nullptr;
  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
 }

-template <typename E, typename S> 
-int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, 
-                          unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
+template <typename E, typename S>
+int evaluate_points_batch(
+  E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream)
+{
  S* _null = nullptr;
  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
 }

-template <typename S> 
-int interpolate_scalars_on_coset(S* d_out, S* d_evaluations, S* d_domain,
-                                 unsigned n, S* coset_powers, cudaStream_t stream) {
+template <typename S>
+int interpolate_scalars_on_coset(
+  S* d_out, S* d_evaluations, S* d_domain, unsigned n, S* coset_powers, cudaStream_t stream)
+{
  return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream);
 }

-template <typename S> 
-int interpolate_scalars_on_coset_batch(S* d_out, S* d_evaluations, S* d_domain,
-                                       unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+template <typename S>
+int interpolate_scalars_on_coset_batch(
+  S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream)
+{
  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream);
 }

-template <typename S> 
-int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, 
-                              unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
+template <typename S>
+int evaluate_scalars_on_coset(
+  S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream)
+{
  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
 }

-template <typename E, typename S> 
-int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, 
-                                    unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+template <typename E, typename S>
+int evaluate_scalars_on_coset_batch(
+  S* d_out,
+  S* d_coefficients,
+  S* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  S* coset_powers,
+  cudaStream_t stream)
+{
  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
 }

-template <typename E, typename S> 
-int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain, 
-                             unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
+template <typename E, typename S>
+int evaluate_points_on_coset(
+  E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream)
+{
  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
 }

-template <typename E, typename S> 
-int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
-                                   unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+template <typename E, typename S>
+int evaluate_points_on_coset_batch(
+  E* d_out,
+  E* d_coefficients,
+  S* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  S* coset_powers,
+  cudaStream_t stream)
+{
  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
 }
 #endif
--- a/icicle/appUtils/ntt/lde.cuh
+++ b/icicle/appUtils/ntt/lde.cuh
@@ -2,45 +2,62 @@
 #define LDE_H
 #pragma once

-template <typename S> 
+template <typename S>
 int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);

-template <typename S> 
-int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);
+template <typename S>
+int interpolate_scalars_batch(
+  S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);

-template <typename E, typename S> 
+template <typename E, typename S>
 int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);

-template <typename E, typename S> 
-int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);
+template <typename E, typename S>
+int interpolate_points_batch(
+  E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);

-template <typename S> 
+template <typename S>
 int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);

-template <typename S> 
-int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);
+template <typename S>
+int evaluate_scalars_batch(
+  S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);

-template <typename E, typename S> 
+template <typename E, typename S>
 int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);

-template <typename E, typename S> 
-int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, 
-                          unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);
+template <typename E, typename S>
+int evaluate_points_batch(
+  E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);

-template <typename S> 
-int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, 
-                              unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);
+template <typename S>
+int evaluate_scalars_on_coset(
+  S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);

-template <typename S>                               
-int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, 
-                                    unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);
+template <typename S>
+int evaluate_scalars_on_coset_batch(
+  S* d_out,
+  S* d_coefficients,
+  S* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  S* coset_powers,
+  cudaStream_t stream);

-template <typename E, typename S> 
-int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain, 
-                             unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);
+template <typename E, typename S>
+int evaluate_points_on_coset(
+  E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);

-template <typename E, typename S> 
-int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
-                                   unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);
+template <typename E, typename S>
+int evaluate_points_on_coset_batch(
+  E* d_out,
+  E* d_coefficients,
+  S* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  S* coset_powers,
+  cudaStream_t stream);

 #endif
--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -6,18 +6,20 @@
 #include "../vector_manipulation/ve_mod_mult.cuh"

 const uint32_t MAX_NUM_THREADS = 1024;
-const uint32_t MAX_THREADS_BATCH = 512;    //TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89
-const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; //TODO: occupancy calculator, hardcoded for sm_86..sm_89
-const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * 1024; 
+const uint32_t MAX_THREADS_BATCH = 512;          // TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89
+const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
+const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * 1024;

 /**
- * Computes the twiddle factors.  
+ * Computes the twiddle factors.
 * Outputs: d_twiddles[i] = omega^i.
- * @param d_twiddles input empty array. 
- * @param n_twiddles number of twiddle factors. 
- * @param omega multiplying factor. 
+ * @param d_twiddles input empty array.
+ * @param n_twiddles number of twiddle factors.
+ * @param omega multiplying factor.
 */
- template < typename S > __global__ void twiddle_factors_kernel(S * d_twiddles, uint32_t n_twiddles, S omega) {
+template <typename S>
+__global__ void twiddle_factors_kernel(S* d_twiddles, uint32_t n_twiddles, S omega)
+{
  for (uint32_t i = 0; i < n_twiddles; i++) {
    d_twiddles[i] = S::zero();
  }
@@ -28,21 +30,25 @@ const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * 1024;
 }

 /**
- * Fills twiddles array with twiddle factors. 
- * @param twiddles input empty array. 
- * @param n_twiddles number of twiddle factors. 
- * @param omega multiplying factor. 
+ * Fills twiddles array with twiddle factors.
+ * @param twiddles input empty array.
+ * @param n_twiddles number of twiddle factors.
+ * @param omega multiplying factor.
 */
- template < typename S > S * fill_twiddle_factors_array(uint32_t n_twiddles, S omega, cudaStream_t stream) {
+template <typename S>
+S* fill_twiddle_factors_array(uint32_t n_twiddles, S omega, cudaStream_t stream)
+{
  size_t size_twiddles = n_twiddles * sizeof(S);
-  S * d_twiddles;
-  cudaMallocAsync(& d_twiddles, size_twiddles, stream);
-  twiddle_factors_kernel<S> <<< 1, 1, 0, stream>>> (d_twiddles, n_twiddles, omega);
+  S* d_twiddles;
+  cudaMallocAsync(&d_twiddles, size_twiddles, stream);
+  twiddle_factors_kernel<S><<<1, 1, 0, stream>>>(d_twiddles, n_twiddles, omega);
  cudaStreamSynchronize(stream);
  return d_twiddles;
 }

-template < typename T > __global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) {
+template <typename T>
+__global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
+{
  int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (threadId < n * batch_size) {
    int idx = threadId % n;
@@ -61,12 +67,14 @@ template < typename T > __global__ void reverse_order_kernel(T* arr, T* arr_reve
 * @param logn log(n).
 * @param batch_size the size of the batch.
 */
-template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) {
+template <typename T>
+void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream)
+{
  T* arr_reversed;
  cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(T), stream);
  int number_of_threads = MAX_THREADS_BATCH;
  int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
-  reverse_order_kernel <<<number_of_blocks, number_of_threads, 0, stream>>> (arr, arr_reversed, n, logn, batch_size);
+  reverse_order_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(arr, arr_reversed, n, logn, batch_size);
  cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice, stream);
  cudaFreeAsync(arr_reversed, stream);
 }
@@ -79,11 +87,12 @@ template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t lo
 * @param n length of `arr`.
 * @param logn log(n).
 */
-template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn, cudaStream_t stream) {
+template <typename T>
+void reverse_order(T* arr, uint32_t n, uint32_t logn, cudaStream_t stream)
+{
  reverse_order_batch(arr, n, logn, 1, stream);
 }

-
 enum Decimation {
  NONE = 0,
  DIF = 1,
@@ -101,25 +110,29 @@ enum Decimation {
 * @param s log2(n) loop index.
 */
 template <typename E, typename S>
-__global__ void ntt_template_kernel_shared_rev(E *__restrict__ arr_g, uint32_t n, const S *__restrict__ r_twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t ss, uint32_t logn)
+__global__ void ntt_template_kernel_shared_rev(
+  E* __restrict__ arr_g,
+  uint32_t n,
+  const S* __restrict__ r_twiddles,
+  uint32_t n_twiddles,
+  uint32_t max_task,
+  uint32_t ss,
+  uint32_t logn)
 {
  SharedMemory<E> smem;
-  E *arr = smem.getPointer();
+  E* arr = smem.getPointer();

  uint32_t task = blockIdx.x;
  uint32_t loop_limit = blockDim.x;
  uint32_t chunks = n / (loop_limit * 2);
  uint32_t offset = (task / chunks) * n;
-  if (task < max_task)
-  {
+  if (task < max_task) {
    // flattened loop allows parallel processing
    uint32_t l = threadIdx.x;

-    if (l < loop_limit)
-    {
+    if (l < loop_limit) {
 #pragma unroll
-      for (; ss < logn; ss++)
-      {
+      for (; ss < logn; ss++) {
        int s = logn - ss - 1;
        bool is_beginning = ss == 0;
        bool is_end = ss == (logn - 1);
@@ -142,15 +155,12 @@ __global__ void ntt_template_kernel_shared_rev(E *__restrict__ arr_g, uint32_t n

        E u = is_beginning ? arr_g[offset + oij] : arr[oij];
        E v = is_beginning ? arr_g[offset + k] : arr[k];
-        if (is_end)
-        {
+        if (is_end) {
          arr_g[offset + oij] = u + v;
          arr_g[offset + k] = tw * (u - v);
-        }
-        else
-        {
+        } else {
          arr[oij] = u + v;
-          arr[k] = tw *(u - v);
+          arr[k] = tw * (u - v);
        }

        __syncthreads();
@@ -170,22 +180,27 @@ __global__ void ntt_template_kernel_shared_rev(E *__restrict__ arr_g, uint32_t n
 * @param s log2(n) loop index.
 */
 template <typename E, typename S>
-__global__ void ntt_template_kernel_shared(E *__restrict__ arr_g, uint32_t n, const S *__restrict__ r_twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, uint32_t logn)
+__global__ void ntt_template_kernel_shared(
+  E* __restrict__ arr_g,
+  uint32_t n,
+  const S* __restrict__ r_twiddles,
+  uint32_t n_twiddles,
+  uint32_t max_task,
+  uint32_t s,
+  uint32_t logn)
 {
  SharedMemory<E> smem;
-  E *arr = smem.getPointer();
+  E* arr = smem.getPointer();

  uint32_t task = blockIdx.x;
  uint32_t loop_limit = blockDim.x;
  uint32_t chunks = n / (loop_limit * 2);
  uint32_t offset = (task / chunks) * n;
-  if (task < max_task)
-  {
+  if (task < max_task) {
    // flattened loop allows parallel processing
    uint32_t l = threadIdx.x;

-    if (l < loop_limit)
-    {
+    if (l < loop_limit) {
 #pragma unroll
      for (; s < logn; s++) // TODO: this loop also can be unrolled
      {
@@ -204,17 +219,13 @@ __global__ void ntt_template_kernel_shared(E *__restrict__ arr_g, uint32_t n, co
        uint32_t k = oij + shift_s;
        S tw = r_twiddles[j * n_twiddles_div];

-
        E u = s == 0 ? arr_g[offset + oij] : arr[oij];
        E v = s == 0 ? arr_g[offset + k] : arr[k];
        v = tw * v;
-        if (s == (logn - 1))
-        {
+        if (s == (logn - 1)) {
          arr_g[offset + oij] = u + v;
          arr_g[offset + k] = u - v;
-        }
-        else
-        {
+        } else {
          arr[oij] = u + v;
          arr[k] = u - v;
        }
@@ -226,9 +237,9 @@ __global__ void ntt_template_kernel_shared(E *__restrict__ arr_g, uint32_t n, co
 }

 /**
- * Cooley-Tukey NTT. 
+ * Cooley-Tukey NTT.
 * NOTE! this function assumes that d_twiddles are located in the device memory.
- * @param arr input array of type E (elements). 
+ * @param arr input array of type E (elements).
 * @param n length of d_arr.
 * @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
 * @param n_twiddles length of twiddles.
@@ -236,26 +247,25 @@ __global__ void ntt_template_kernel_shared(E *__restrict__ arr_g, uint32_t n, co
 * @param s log2(n) loop index.
 */
 template <typename E, typename S>
-__global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, bool rev)
+__global__ void
+ntt_template_kernel(E* arr, uint32_t n, S* twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, bool rev)
 {
  int task = blockIdx.x;
  int chunks = n / (blockDim.x * 2);

-  if (task < max_task)
-  {
+  if (task < max_task) {
    // flattened loop allows parallel processing
    uint32_t l = threadIdx.x;
    uint32_t loop_limit = blockDim.x;

-    if (l < loop_limit)
-    {
+    if (l < loop_limit) {
      uint32_t ntw_i = task % chunks;

      uint32_t shift_s = 1 << s;
      uint32_t shift2_s = 1 << (s + 1);
      uint32_t n_twiddles_div = n_twiddles >> (s + 1);

-      l = ntw_i * blockDim.x + l; //to l from chunks to full
+      l = ntw_i * blockDim.x + l; // to l from chunks to full

      uint32_t j = l & (shift_s - 1);               // Equivalent to: l % (1 << s)
      uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2)
@@ -278,18 +288,26 @@ __global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_
 * NTT/INTT inplace batch
 * Note: this function does not preform any bit-reverse permutations on its inputs or outputs.
 * @param d_inout Array for inplace processing
- * @param d_twiddles 
+ * @param d_twiddles
 * @param n Length of `d_twiddles` array
 * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
 * @param inverse true for iNTT
 * @param is_coset true for multiplication by coset
 * @param coset should be array of lenght n - or in case of lesser than n, right-padded with zeroes
- * @param stream CUDA stream   
+ * @param stream CUDA stream
 * @param is_sync_needed do perform sync of the supplied CUDA stream at the end of processing
 */
-template <typename E, typename S> void ntt_inplace_batch_template(
-  E * d_inout, S * d_twiddles, unsigned n, unsigned batch_size, bool inverse, 
-  bool is_coset, S * coset, cudaStream_t stream, bool is_sync_needed) 
+template <typename E, typename S>
+void ntt_inplace_batch_template(
+  E* d_inout,
+  S* d_twiddles,
+  unsigned n,
+  unsigned batch_size,
+  bool inverse,
+  bool is_coset,
+  S* coset,
+  cudaStream_t stream,
+  bool is_sync_needed)
 {
  const int logn = int(log(n) / log(2));
  bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE;
@@ -298,36 +316,41 @@ template <typename E, typename S> void ntt_inplace_batch_template(
  const int chunks = max(int((n / 2) / num_threads), 1);
  const int total_tasks = batch_size * chunks;
  int num_blocks = total_tasks;
-  const int shared_mem = 2 * num_threads * sizeof(E); // TODO: calculator, as shared mem size may be more efficient less then max to allow more concurrent blocks on SM
-  const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2)) : 0; //TODO: shared memory support only for types <= 32 bytes
+  const int shared_mem = 2 * num_threads * sizeof(E); // TODO: calculator, as shared mem size may be more efficient less
+                                                      // then max to allow more concurrent blocks on SM
+  const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2))
+                                               : 0; // TODO: shared memory support only for types <= 32 bytes

-  if (inverse) 
-  {
-    if (is_shared_mem_enabled) ntt_template_kernel_shared<<<num_blocks, num_threads, shared_mem, stream>>>(d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem);
+  if (inverse) {
+    if (is_shared_mem_enabled)
+      ntt_template_kernel_shared<<<num_blocks, num_threads, shared_mem, stream>>>(
+        d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem);

    for (int s = logn_shmem; s < logn; s++) // TODO: this loop also can be unrolled
-    { 
-      ntt_template_kernel <E, S> <<<num_blocks, num_threads, 0, stream>>>(d_inout, n, d_twiddles, n, total_tasks, s, false);
+    {
+      ntt_template_kernel<E, S>
+        <<<num_blocks, num_threads, 0, stream>>>(d_inout, n, d_twiddles, n, total_tasks, s, false);
    }

    if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream);

    num_threads = min(n / 2, MAX_NUM_THREADS);
    num_blocks = (n * batch_size + num_threads - 1) / num_threads;
-    template_normalize_kernel <E, S> <<<num_blocks, num_threads, 0, stream>>> (d_inout, n * batch_size, S::inv_log_size(logn)); 
-  }
-  else 
-  {
+    template_normalize_kernel<E, S>
+      <<<num_blocks, num_threads, 0, stream>>>(d_inout, n * batch_size, S::inv_log_size(logn));
+  } else {
    if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream);

    for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled
    {
      ntt_template_kernel<<<num_blocks, num_threads, 0, stream>>>(d_inout, n, d_twiddles, n, total_tasks, s, true);
    }
-    
-    if (is_shared_mem_enabled) ntt_template_kernel_shared_rev<<<num_blocks, num_threads, shared_mem, stream>>>(d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem);
+
+    if (is_shared_mem_enabled)
+      ntt_template_kernel_shared_rev<<<num_blocks, num_threads, shared_mem, stream>>>(
+        d_inout, 1 << logn_shmem, d_twiddles, n, total_tasks, 0, logn_shmem);
  }
-  
+
  if (!is_sync_needed) return;

  cudaStreamSynchronize(stream);
@@ -335,30 +358,32 @@ template <typename E, typename S> void ntt_inplace_batch_template(

 /**
 * Cooley-Tukey (scalar) NTT.
- * This is a bached version - meaning it assumes than the input array 
+ * This is a bached version - meaning it assumes than the input array
 * consists of N arrays of size n. The function performs n-size NTT on each small array.
- * @param arr input array of type BLS12_381::scalar_t. 
- * @param arr_size number of total elements = n * N.  
+ * @param arr input array of type BLS12_381::scalar_t.
+ * @param arr_size number of total elements = n * N.
 * @param n size of batch.
- * @param inverse indicate if the result array should be normalized by n^(-1). 
+ * @param inverse indicate if the result array should be normalized by n^(-1).
 */
- template <typename E, typename S> uint32_t ntt_end2end_batch_template(E * arr, uint32_t arr_size, uint32_t n, bool inverse, cudaStream_t stream) {
+template <typename E, typename S>
+uint32_t ntt_end2end_batch_template(E* arr, uint32_t arr_size, uint32_t n, bool inverse, cudaStream_t stream)
+{
  int batches = int(arr_size / n);
  uint32_t logn = uint32_t(log(n) / log(2));
-  uint32_t n_twiddles = n; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. 
+  uint32_t n_twiddles = n; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order.
  size_t size_E = arr_size * sizeof(E);
-  S * d_twiddles;
-  if (inverse){
+  S* d_twiddles;
+  if (inverse) {
    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn), stream);
-  } else{
+  } else {
    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn), stream);
  }
-  E * d_arr;
-  cudaMallocAsync( & d_arr, size_E, stream);
+  E* d_arr;
+  cudaMallocAsync(&d_arr, size_E, stream);
  cudaMemcpyAsync(d_arr, arr, size_E, cudaMemcpyHostToDevice, stream);
  int NUM_THREADS = MAX_THREADS_BATCH;
  int NUM_BLOCKS = (batches + NUM_THREADS - 1) / NUM_THREADS;
-   
+
  S* _null = nullptr;
  ntt_inplace_batch_template(d_arr, d_twiddles, n, batches, inverse, false, _null, stream, false);

@@ -366,17 +391,19 @@ template <typename E, typename S> void ntt_inplace_batch_template(
  cudaFreeAsync(d_arr, stream);
  cudaFreeAsync(d_twiddles, stream);
  cudaStreamSynchronize(stream);
-  return 0; 
+  return 0;
 }

 /**
- * Cooley-Tukey (scalar) NTT. 
- * @param arr input array of type E (element). 
+ * Cooley-Tukey (scalar) NTT.
+ * @param arr input array of type E (element).
 * @param n length of d_arr.
- * @param inverse indicate if the result array should be normalized by n^(-1). 
+ * @param inverse indicate if the result array should be normalized by n^(-1).
 */
- template<typename E,typename S> uint32_t ntt_end2end_template(E * arr, uint32_t n, bool inverse, cudaStream_t stream) {
-  return ntt_end2end_batch_template <E, S> (arr, n, n, inverse, stream);
+template <typename E, typename S>
+uint32_t ntt_end2end_template(E* arr, uint32_t n, bool inverse, cudaStream_t stream)
+{
+  return ntt_end2end_batch_template<E, S>(arr, n, n, inverse, stream);
 }

 #endif
--- a/icicle/appUtils/poseidon/constants.cuh
+++ b/icicle/appUtils/poseidon/constants.cuh
@@ -1,27 +1,27 @@
 #pragma once

-#include <map>
-#include <stdexcept>
-#include <cassert>
-
+#include "constants/constants_11.h"
 #include "constants/constants_2.h"
 #include "constants/constants_4.h"
 #include "constants/constants_8.h"
-#include "constants/constants_11.h"
+#include <cassert>
+#include <map>
+#include <stdexcept>

-uint32_t partial_rounds_number_from_arity(const uint32_t arity) {
-    switch (arity) {
-        case 2:
-            return 55;
-        case 4:
-            return 56;
-        case 8:
-            return 57;
-        case 11:
-            return 57;
-        default:
-            throw std::invalid_argument( "unsupported arity" );
-    }
+uint32_t partial_rounds_number_from_arity(const uint32_t arity)
+{
+  switch (arity) {
+  case 2:
+    return 55;
+  case 4:
+    return 56;
+  case 8:
+    return 57;
+  case 11:
+    return 57;
+  default:
+    throw std::invalid_argument("unsupported arity");
+  }
 };

 // TO-DO: change to mapping
@@ -29,23 +29,24 @@ const uint32_t FULL_ROUNDS_DEFAULT = 4;

 // TO-DO: for now, the constants are only generated in bls12_381
 template <typename S>
-S * load_constants(const uint32_t arity) {
-    unsigned char * constants;
-    switch (arity) {
-        case 2:
-            constants = constants_2;
-            break;
-        case 4:
-            constants = constants_4;
-            break;
-        case 8:
-            constants = constants_8;
-            break;
-        case 11:
-            constants = constants_11;
-            break;
-        default:
-            throw std::invalid_argument( "unsupported arity" );
-    }
-    return reinterpret_cast< S * >(constants);
+S* load_constants(const uint32_t arity)
+{
+  unsigned char* constants;
+  switch (arity) {
+  case 2:
+    constants = constants_2;
+    break;
+  case 4:
+    constants = constants_4;
+    break;
+  case 8:
+    constants = constants_8;
+    break;
+  case 11:
+    constants = constants_11;
+    break;
+  default:
+    throw std::invalid_argument("unsupported arity");
+  }
+  return reinterpret_cast<S*>(constants);
 }
--- a/icicle/appUtils/poseidon/constants/constants_11.h
+++ b/icicle/appUtils/poseidon/constants/constants_11.h
--- a/icicle/appUtils/poseidon/constants/constants_2.h
+++ b/icicle/appUtils/poseidon/constants/constants_2.h
--- a/icicle/appUtils/poseidon/constants/constants_4.h
+++ b/icicle/appUtils/poseidon/constants/constants_4.h
--- a/icicle/appUtils/poseidon/constants/constants_8.h
+++ b/icicle/appUtils/poseidon/constants/constants_8.h
--- a/icicle/appUtils/poseidon/poseidon.cu
+++ b/icicle/appUtils/poseidon/poseidon.cu
@@ -1,273 +1,266 @@
 #include "poseidon.cuh"

 template <typename S>
-__global__ void prepare_poseidon_states(S * states, size_t number_of_states, S domain_tag, const PoseidonConfiguration<S> config) {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int state_number = idx / config.t;
-    if (state_number >= number_of_states) {
-        return;
-    }
-    int element_number = idx % config.t;
+__global__ void
+prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, const PoseidonConfiguration<S> config)
+{
+  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int state_number = idx / config.t;
+  if (state_number >= number_of_states) { return; }
+  int element_number = idx % config.t;

-    S prepared_element;
+  S prepared_element;

-    // Domain separation
-    if (element_number == 0) {
-        prepared_element = domain_tag;
-    } else {
-        prepared_element = states[state_number * config.t + element_number - 1];
-    }
+  // Domain separation
+  if (element_number == 0) {
+    prepared_element = domain_tag;
+  } else {
+    prepared_element = states[state_number * config.t + element_number - 1];
+  }

-    // Add pre-round constant
-    prepared_element = prepared_element + config.round_constants[element_number];
+  // Add pre-round constant
+  prepared_element = prepared_element + config.round_constants[element_number];

-    // Store element in state
-    states[idx] = prepared_element;
+  // Store element in state
+  states[idx] = prepared_element;
 }

 template <typename S>
-__device__ __forceinline__ S sbox_alpha_five(S element) {
-    S result = S::sqr(element);
-    result = S::sqr(result);
-    return result * element;
+__device__ __forceinline__ S sbox_alpha_five(S element)
+{
+  S result = S::sqr(element);
+  result = S::sqr(result);
+  return result * element;
 }

 template <typename S>
-__device__ S vecs_mul_matrix(S element, S * matrix, int element_number, int vec_number, int size, S * shared_states) {
-    shared_states[threadIdx.x] = element;
-    __syncthreads();
+__device__ S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, int size, S* shared_states)
+{
+  shared_states[threadIdx.x] = element;
+  __syncthreads();

-    element = S::zero();
-    for (int i = 0; i < size; i++) {
-        element = element + (shared_states[vec_number * size + i] * matrix[i * size + element_number]);
-    }
-    __syncthreads();
-    return element;
+  element = S::zero();
+  for (int i = 0; i < size; i++) {
+    element = element + (shared_states[vec_number * size + i] * matrix[i * size + element_number]);
+  }
+  __syncthreads();
+  return element;
 }

 template <typename S>
-__device__ S full_round(S element,
-                        size_t rc_offset,
-                        int local_state_number,
-                        int element_number,
-                        bool multiply_by_mds,
-                        bool add_round_constant,
-                        S * shared_states,
-                        const PoseidonConfiguration<S> config) {
-    element = sbox_alpha_five(element);
-    if (add_round_constant) {
-        element = element + config.round_constants[rc_offset + element_number];
-    }
+__device__ S full_round(
+  S element,
+  size_t rc_offset,
+  int local_state_number,
+  int element_number,
+  bool multiply_by_mds,
+  bool add_round_constant,
+  S* shared_states,
+  const PoseidonConfiguration<S> config)
+{
+  element = sbox_alpha_five(element);
+  if (add_round_constant) { element = element + config.round_constants[rc_offset + element_number]; }

-    // Multiply all the states by mds matrix
-    S * matrix = multiply_by_mds ? config.mds_matrix : config.non_sparse_matrix;
-    return vecs_mul_matrix(element, matrix, element_number, local_state_number, config.t, shared_states);
+  // Multiply all the states by mds matrix
+  S* matrix = multiply_by_mds ? config.mds_matrix : config.non_sparse_matrix;
+  return vecs_mul_matrix(element, matrix, element_number, local_state_number, config.t, shared_states);
 }

 // Execute full rounds
 template <typename S>
-__global__ void full_rounds(S * states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConfiguration<S> config) {
-    extern __shared__ S shared_states[];
+__global__ void full_rounds(
+  S* states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConfiguration<S> config)
+{
+  extern __shared__ S shared_states[];

-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int state_number = idx / config.t;
-    if (state_number >= number_of_states) {
-        return;
-    }
-    int local_state_number = threadIdx.x / config.t;
-    int element_number = idx % config.t;
+  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int state_number = idx / config.t;
+  if (state_number >= number_of_states) { return; }
+  int local_state_number = threadIdx.x / config.t;
+  int element_number = idx % config.t;

-    for (int i = 0; i < config.full_rounds_half - 1; i++) {
-        states[idx] = full_round(states[idx],
-                                 rc_offset,
-                                 local_state_number,
-                                 element_number,
-                                 true,
-                                 true,
-                                 shared_states,
-                                 config);
-        rc_offset += config.t;
-    }
+  for (int i = 0; i < config.full_rounds_half - 1; i++) {
+    states[idx] =
+      full_round(states[idx], rc_offset, local_state_number, element_number, true, true, shared_states, config);
+    rc_offset += config.t;
+  }

-    states[idx] = full_round(states[idx],
-                             rc_offset,
-                             local_state_number,
-                             element_number,
-                             !first_half,
-                             first_half,
-                             shared_states,
-                             config);
+  states[idx] = full_round(
+    states[idx], rc_offset, local_state_number, element_number, !first_half, first_half, shared_states, config);
 }

 template <typename S>
-__device__ S partial_round(S * state,
-                                  size_t rc_offset,
-                                  int round_number,
-                                  const PoseidonConfiguration<S> config) {
-    S element = state[0];
-    element = sbox_alpha_five(element);
-    element = element + config.round_constants[rc_offset];
+__device__ S partial_round(S* state, size_t rc_offset, int round_number, const PoseidonConfiguration<S> config)
+{
+  S element = state[0];
+  element = sbox_alpha_five(element);
+  element = element + config.round_constants[rc_offset];

-    S * sparse_matrix = &config.sparse_matrices[(config.t * 2 - 1) * round_number];
+  S* sparse_matrix = &config.sparse_matrices[(config.t * 2 - 1) * round_number];

-    state[0] = element * sparse_matrix[0];
-    for (int i = 1; i < config.t; i++) {
-        state[0] = state[0] + (state[i] * sparse_matrix[i]);
-    }
+  state[0] = element * sparse_matrix[0];
+  for (int i = 1; i < config.t; i++) {
+    state[0] = state[0] + (state[i] * sparse_matrix[i]);
+  }

-    for (int i = 1; i < config.t; i++) {
-        state[i] = state[i] + (element * sparse_matrix[config.t + i - 1]);
-    }
+  for (int i = 1; i < config.t; i++) {
+    state[i] = state[i] + (element * sparse_matrix[config.t + i - 1]);
+  }
 }

 // Execute partial rounds
 template <typename S>
-__global__ void partial_rounds(S * states, size_t number_of_states, size_t rc_offset, const PoseidonConfiguration<S> config) {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) {
-        return;
-    }
+__global__ void
+partial_rounds(S* states, size_t number_of_states, size_t rc_offset, const PoseidonConfiguration<S> config)
+{
+  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idx >= number_of_states) { return; }

-    S * state = &states[idx * config.t];
+  S* state = &states[idx * config.t];

-    for (int i = 0; i < config.partial_rounds; i++) {
-        partial_round(state, rc_offset, i, config);
-        rc_offset++;
-    }
+  for (int i = 0; i < config.partial_rounds; i++) {
+    partial_round(state, rc_offset, i, config);
+    rc_offset++;
+  }
 }

 // These function is just doing copy from the states to the output
 template <typename S>
-__global__ void get_hash_results(S * states, size_t number_of_states, S * out, int t) {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) {
-        return;
-    }
+__global__ void get_hash_results(S* states, size_t number_of_states, S* out, int t)
+{
+  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idx >= number_of_states) { return; }

-    out[idx] = states[idx * t + 1];
+  out[idx] = states[idx * t + 1];
 }

 template <typename S>
-__host__ void Poseidon<S>::hash_blocks(const S * inp, size_t blocks, S * out, HashType hash_type, cudaStream_t stream) {
-    S * states;
+__host__ void Poseidon<S>::hash_blocks(const S* inp, size_t blocks, S* out, HashType hash_type, cudaStream_t stream)
+{
+  S* states;

-    // allocate memory for {blocks} states of {t} scalars each
-    if (cudaMallocAsync(&states, blocks * this->t * sizeof(S), stream) != cudaSuccess) {
-        throw std::runtime_error("Failed memory allocation on the device");
-    }
+  // allocate memory for {blocks} states of {t} scalars each
+  if (cudaMallocAsync(&states, blocks * this->t * sizeof(S), stream) != cudaSuccess) {
+    throw std::runtime_error("Failed memory allocation on the device");
+  }

-    // This is where the input matrix of size Arity x NumberOfBlocks is
-    // padded and coppied to device in a T x NumberOfBlocks matrix
-    cudaMemcpy2DAsync(states, this->t * sizeof(S),  // Device pointer and device pitch
-                 inp, (this->t - 1) * sizeof(S),    // Host pointer and pitch
-                 (this->t - 1) * sizeof(S), blocks, // Size of the source matrix (Arity x NumberOfBlocks)
-                 cudaMemcpyHostToDevice, stream);
+  // This is where the input matrix of size Arity x NumberOfBlocks is
+  // padded and copied to device in a T x NumberOfBlocks matrix
+  cudaMemcpy2DAsync(
+    states, this->t * sizeof(S),       // Device pointer and device pitch
+    inp, (this->t - 1) * sizeof(S),    // Host pointer and pitch
+    (this->t - 1) * sizeof(S), blocks, // Size of the source matrix (Arity x NumberOfBlocks)
+    cudaMemcpyHostToDevice, stream);

-    size_t rc_offset = 0;
+  size_t rc_offset = 0;

-    // The logic behind this is that 1 thread only works on 1 element
-    // We have {t} elements in each state, and {blocks} states total
-    int number_of_threads = (256 / this->t) * this->t;
-    int hashes_per_block = number_of_threads / this->t;
-    int total_number_of_threads = blocks * this->t;
-    int number_of_blocks = total_number_of_threads / number_of_threads +
-        static_cast<bool>(total_number_of_threads % number_of_threads);
+  // The logic behind this is that 1 thread only works on 1 element
+  // We have {t} elements in each state, and {blocks} states total
+  int number_of_threads = (256 / this->t) * this->t;
+  int hashes_per_block = number_of_threads / this->t;
+  int total_number_of_threads = blocks * this->t;
+  int number_of_blocks =
+    total_number_of_threads / number_of_threads + static_cast<bool>(total_number_of_threads % number_of_threads);

-    // The partial rounds operates on the whole state, so we define
-    // the parallelism params for processing a single hash preimage per thread
-    int singlehash_block_size = 128;
-    int number_of_singlehash_blocks = blocks / singlehash_block_size + static_cast<bool>(blocks % singlehash_block_size);
+  // The partial rounds operate on the whole state, so we define
+  // the parallelism params for processing a single hash preimage per thread
+  int singlehash_block_size = 128;
+  int number_of_singlehash_blocks = blocks / singlehash_block_size + static_cast<bool>(blocks % singlehash_block_size);

-    // Pick the domain_tag accordinaly
-    S domain_tag;
-    switch (hash_type) {
-        case HashType::ConstInputLen:
-            domain_tag = this->const_input_no_pad_domain_tag;
-            break;
+  // Pick the domain_tag accordingly
+  S domain_tag;
+  switch (hash_type) {
+  case HashType::ConstInputLen:
+    domain_tag = this->const_input_no_pad_domain_tag;
+    break;

-        case HashType::MerkleTree:
-            domain_tag = this->tree_domain_tag;
-    }
+  case HashType::MerkleTree:
+    domain_tag = this->tree_domain_tag;
+  }

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    auto start_time = std::chrono::high_resolution_clock::now();
-    #endif
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  auto start_time = std::chrono::high_resolution_clock::now();
+#endif

-    // Domain separation and adding pre-round constants
-    prepare_poseidon_states <<< number_of_blocks, number_of_threads, 0, stream >>> (states, blocks, domain_tag, this->config);
-    rc_offset += this->t;
+  // Domain separation and adding pre-round constants
+  prepare_poseidon_states<<<number_of_blocks, number_of_threads, 0, stream>>>(states, blocks, domain_tag, this->config);
+  rc_offset += this->t;

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    cudaStreamSynchronize(stream);
-    std::cout << "Domain separation: " << rc_offset << std::endl;
-    //print_buffer_from_cuda<S>(states, blocks * this->t);
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  cudaStreamSynchronize(stream);
+  std::cout << "Domain separation: " << rc_offset << std::endl;
+  // print_buffer_from_cuda<S>(states, blocks * this->t);

-    auto end_time = std::chrono::high_resolution_clock::now();
-    auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-    std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
-    start_time = std::chrono::high_resolution_clock::now();
-    #endif
+  auto end_time = std::chrono::high_resolution_clock::now();
+  auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
+  start_time = std::chrono::high_resolution_clock::now();
+#endif

-    // execute half full rounds
-    full_rounds <<< number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block * this->t, stream >>> (states, blocks, rc_offset, true, this->config);
-    rc_offset += this->t * this->config.full_rounds_half;
+  // execute half full rounds
+  full_rounds<<<number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block* this->t, stream>>>(
+    states, blocks, rc_offset, true, this->config);
+  rc_offset += this->t * this->config.full_rounds_half;

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    cudaStreamSynchronize(stream);
-    std::cout << "Full rounds 1. RCOFFSET: " << rc_offset << std::endl;
-    // print_buffer_from_cuda<S>(states, blocks * this->t);
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  cudaStreamSynchronize(stream);
+  std::cout << "Full rounds 1. RCOFFSET: " << rc_offset << std::endl;
+  // print_buffer_from_cuda<S>(states, blocks * this->t);

-    end_time = std::chrono::high_resolution_clock::now();
-    elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-    std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
-    start_time = std::chrono::high_resolution_clock::now();
-    #endif
+  end_time = std::chrono::high_resolution_clock::now();
+  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
+  start_time = std::chrono::high_resolution_clock::now();
+#endif

-    // execute partial rounds
-    partial_rounds <<< number_of_singlehash_blocks, singlehash_block_size, 0, stream >>> (states, blocks, rc_offset, this->config);
-    rc_offset += this->config.partial_rounds;
+  // execute partial rounds
+  partial_rounds<<<number_of_singlehash_blocks, singlehash_block_size, 0, stream>>>(
+    states, blocks, rc_offset, this->config);
+  rc_offset += this->config.partial_rounds;

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    cudaStreamSynchronize(stream);
-    std::cout << "Partial rounds. RCOFFSET: " << rc_offset << std::endl;
-    //print_buffer_from_cuda<S>(states, blocks * this->t);
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  cudaStreamSynchronize(stream);
+  std::cout << "Partial rounds. RCOFFSET: " << rc_offset << std::endl;
+  // print_buffer_from_cuda<S>(states, blocks * this->t);

-    end_time = std::chrono::high_resolution_clock::now();
-    elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-    std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
-    start_time = std::chrono::high_resolution_clock::now();
-    #endif
+  end_time = std::chrono::high_resolution_clock::now();
+  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
+  start_time = std::chrono::high_resolution_clock::now();
+#endif

-    // execute half full rounds
-    full_rounds <<< number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block * this->t, stream >>> (states, blocks, rc_offset, false, this->config);
+  // execute half full rounds
+  full_rounds<<<number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block* this->t, stream>>>(
+    states, blocks, rc_offset, false, this->config);

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    cudaStreamSynchronize(stream);
-    std::cout << "Full rounds 2. RCOFFSET: " << rc_offset << std::endl;
-    //print_buffer_from_cuda<S>(states, blocks * this->t);
-    end_time = std::chrono::high_resolution_clock::now();
-    elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-    std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
-    start_time = std::chrono::high_resolution_clock::now();
-    #endif
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  cudaStreamSynchronize(stream);
+  std::cout << "Full rounds 2. RCOFFSET: " << rc_offset << std::endl;
+  // print_buffer_from_cuda<S>(states, blocks * this->t);
+  end_time = std::chrono::high_resolution_clock::now();
+  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
+  start_time = std::chrono::high_resolution_clock::now();
+#endif

-    // get output
-    S * out_device;
-    cudaMalloc(&out_device, blocks * sizeof(S));
-    get_hash_results <<< number_of_singlehash_blocks, singlehash_block_size, 0, stream >>> (states, blocks, out_device, this->config.t);
+  // get output
+  S* out_device;
+  cudaMalloc(&out_device, blocks * sizeof(S));
+  get_hash_results<<<number_of_singlehash_blocks, singlehash_block_size, 0, stream>>>(
+    states, blocks, out_device, this->config.t);

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    cudaStreamSynchronize(stream);
-    std::cout << "Get hash results" << std::endl;
-    end_time = std::chrono::high_resolution_clock::now();
-    elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-    std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
-    #endif
-    cudaMemcpyAsync(out, out_device, blocks * sizeof(S), cudaMemcpyDeviceToHost, stream);
-    cudaFreeAsync(out_device, stream);
-    cudaFreeAsync(states, stream);
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  cudaStreamSynchronize(stream);
+  std::cout << "Get hash results" << std::endl;
+  end_time = std::chrono::high_resolution_clock::now();
+  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
+#endif
+  cudaMemcpyAsync(out, out_device, blocks * sizeof(S), cudaMemcpyDeviceToHost, stream);
+  cudaFreeAsync(out_device, stream);
+  cudaFreeAsync(states, stream);

-    #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-    cudaDeviceReset();
-    #endif
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+  cudaDeviceReset();
+#endif
 }
--- a/icicle/appUtils/poseidon/poseidon.cuh
+++ b/icicle/appUtils/poseidon/poseidon.cuh
@@ -2,19 +2,20 @@
 #include "constants.cuh"

 #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include <sstream>
 #include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>

 template <typename S>
-__host__ void print_buffer_from_cuda(S * device_ptr, size_t size, size_t t) {
-  S * buffer = static_cast< S * >(malloc(size * sizeof(S)));
+__host__ void print_buffer_from_cuda(S* device_ptr, size_t size, size_t t)
+{
+  S* buffer = static_cast<S*>(malloc(size * sizeof(S)));
  cudaMemcpy(buffer, device_ptr, size * sizeof(S), cudaMemcpyDeviceToHost);

  std::cout << "Start print" << std::endl;
-  for(int i = 0; i < size / t; i++) {
+  for (int i = 0; i < size / t; i++) {
    std::cout << "State #" << i << std::endl;
    for (int j = 0; j < t; j++) {
      std::cout << buffer[i * t + j] << std::endl;
@@ -28,136 +29,129 @@ __host__ void print_buffer_from_cuda(S * device_ptr, size_t size, size_t t) {

 #ifdef DEBUG
 template <typename S>
-__device__ void print_scalar(S element, int data) {
-    printf("D# %d, T# %d: 0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
-           data,
-           threadIdx.x,
-           element.limbs_storage.limbs[0],
-           element.limbs_storage.limbs[1],
-           element.limbs_storage.limbs[2],
-           element.limbs_storage.limbs[3],
-           element.limbs_storage.limbs[4],
-           element.limbs_storage.limbs[5],
-           element.limbs_storage.limbs[6],
-           element.limbs_storage.limbs[7]
-    );
+__device__ void print_scalar(S element, int data)
+{
+  printf(
+    "D# %d, T# %d: 0x%08x%08x%08x%08x%08x%08x%08x%08x\n", data, threadIdx.x, element.limbs_storage.limbs[0],
+    element.limbs_storage.limbs[1], element.limbs_storage.limbs[2], element.limbs_storage.limbs[3],
+    element.limbs_storage.limbs[4], element.limbs_storage.limbs[5], element.limbs_storage.limbs[6],
+    element.limbs_storage.limbs[7]);
 }
 #endif

 template <typename S>
 struct PoseidonConfiguration {
-    uint32_t partial_rounds, full_rounds_half, t;
-    S * round_constants, * mds_matrix, * non_sparse_matrix, *sparse_matrices;
+  uint32_t partial_rounds, full_rounds_half, t;
+  S *round_constants, *mds_matrix, *non_sparse_matrix, *sparse_matrices;
 };

 template <typename S>
-class Poseidon {
-  public:
-    uint32_t t;
-    PoseidonConfiguration<S> config;
+class Poseidon
+{
+public:
+  uint32_t t;
+  PoseidonConfiguration<S> config;

-    enum HashType {
-        ConstInputLen,
-        MerkleTree,
-    };
+  enum HashType {
+    ConstInputLen,
+    MerkleTree,
+  };

-    Poseidon(const uint32_t arity, cudaStream_t stream) {
-        t = arity + 1;
-        this->config.t = t;
-        this->stream = stream;
+  Poseidon(const uint32_t arity, cudaStream_t stream)
+  {
+    t = arity + 1;
+    this->config.t = t;
+    this->stream = stream;

-        // Pre-calculate domain tags
-        // Domain tags will vary for different applications of Poseidon
-        uint32_t tree_domain_tag_value = 1;
-        tree_domain_tag_value = (tree_domain_tag_value << arity) - tree_domain_tag_value;
-        tree_domain_tag = S::from(tree_domain_tag_value);
+    // Pre-calculate domain tags
+    // Domain tags will vary for different applications of Poseidon
+    uint32_t tree_domain_tag_value = 1;
+    tree_domain_tag_value = (tree_domain_tag_value << arity) - tree_domain_tag_value;
+    tree_domain_tag = S::from(tree_domain_tag_value);

-        const_input_no_pad_domain_tag = S::one();
+    const_input_no_pad_domain_tag = S::one();

-        // TO-DO: implement binary shifts for scalar type
-        // const_input_no_pad_domain_tag = S::one() << 64;
-        // const_input_no_pad_domain_tag *= S::from(arity);
+    // TO-DO: implement binary shifts for scalar type
+    // const_input_no_pad_domain_tag = S::one() << 64;
+    // const_input_no_pad_domain_tag *= S::from(arity);

-        this->config.full_rounds_half = FULL_ROUNDS_DEFAULT;
-        this->config.partial_rounds = partial_rounds_number_from_arity(arity);
+    this->config.full_rounds_half = FULL_ROUNDS_DEFAULT;
+    this->config.partial_rounds = partial_rounds_number_from_arity(arity);

-        uint32_t round_constants_len = t * this->config.full_rounds_half * 2 + this->config.partial_rounds;
-        uint32_t mds_matrix_len = t * t;
-        uint32_t sparse_matrices_len = (t * 2 - 1) * this->config.partial_rounds;
+    uint32_t round_constants_len = t * this->config.full_rounds_half * 2 + this->config.partial_rounds;
+    uint32_t mds_matrix_len = t * t;
+    uint32_t sparse_matrices_len = (t * 2 - 1) * this->config.partial_rounds;

-        // All the constants are stored in a single file
-        S * constants = load_constants<S>(arity);
+    // All the constants are stored in a single file
+    S* constants = load_constants<S>(arity);

-        S * mds_offset = constants + round_constants_len;
-        S * non_sparse_offset = mds_offset + mds_matrix_len;
-        S * sparse_matrices_offset = non_sparse_offset + mds_matrix_len;
+    S* mds_offset = constants + round_constants_len;
+    S* non_sparse_offset = mds_offset + mds_matrix_len;
+    S* sparse_matrices_offset = non_sparse_offset + mds_matrix_len;

-        #if !defined(__CUDA_ARCH__) && defined(DEBUG)
-        std::cout << "P: " << this->config.partial_rounds << " F: " << this->config.full_rounds_half << std::endl;
-        #endif
+#if !defined(__CUDA_ARCH__) && defined(DEBUG)
+    std::cout << "P: " << this->config.partial_rounds << " F: " << this->config.full_rounds_half << std::endl;
+#endif

-        // Create streams for copying constants
-        cudaStream_t stream_copy_round_constants, stream_copy_mds_matrix, stream_copy_non_sparse, stream_copy_sparse_matrices;
-        cudaStreamCreate(&stream_copy_round_constants);
-        cudaStreamCreate(&stream_copy_mds_matrix);
-        cudaStreamCreate(&stream_copy_non_sparse);
-        cudaStreamCreate(&stream_copy_sparse_matrices);
-        
-        // Create events for copying constants
-        cudaEvent_t event_copied_round_constants, event_copy_mds_matrix, event_copy_non_sparse, event_copy_sparse_matrices;
-        cudaEventCreateWithFlags(&event_copied_round_constants, cudaEventDisableTiming);
-        cudaEventCreateWithFlags(&event_copy_mds_matrix, cudaEventDisableTiming);
-        cudaEventCreateWithFlags(&event_copy_non_sparse, cudaEventDisableTiming);
-        cudaEventCreateWithFlags(&event_copy_sparse_matrices, cudaEventDisableTiming);
+    // Create streams for copying constants
+    cudaStream_t stream_copy_round_constants, stream_copy_mds_matrix, stream_copy_non_sparse,
+      stream_copy_sparse_matrices;
+    cudaStreamCreate(&stream_copy_round_constants);
+    cudaStreamCreate(&stream_copy_mds_matrix);
+    cudaStreamCreate(&stream_copy_non_sparse);
+    cudaStreamCreate(&stream_copy_sparse_matrices);

-        // Malloc memory for copying constants
-        cudaMallocAsync(&this->config.round_constants, sizeof(S) * round_constants_len, stream_copy_round_constants);
-        cudaMallocAsync(&this->config.mds_matrix, sizeof(S) * mds_matrix_len, stream_copy_mds_matrix);
-        cudaMallocAsync(&this->config.non_sparse_matrix, sizeof(S) * mds_matrix_len, stream_copy_non_sparse);
-        cudaMallocAsync(&this->config.sparse_matrices, sizeof(S) * sparse_matrices_len, stream_copy_sparse_matrices);
+    // Create events for copying constants
+    cudaEvent_t event_copied_round_constants, event_copy_mds_matrix, event_copy_non_sparse, event_copy_sparse_matrices;
+    cudaEventCreateWithFlags(&event_copied_round_constants, cudaEventDisableTiming);
+    cudaEventCreateWithFlags(&event_copy_mds_matrix, cudaEventDisableTiming);
+    cudaEventCreateWithFlags(&event_copy_non_sparse, cudaEventDisableTiming);
+    cudaEventCreateWithFlags(&event_copy_sparse_matrices, cudaEventDisableTiming);

-        // Copy constants
-        cudaMemcpyAsync(this->config.round_constants, constants,
-            sizeof(S) * round_constants_len,
-            cudaMemcpyHostToDevice, stream_copy_round_constants
-        );
-        cudaMemcpyAsync(this->config.mds_matrix, mds_offset,
-            sizeof(S) * mds_matrix_len,
-            cudaMemcpyHostToDevice, stream_copy_mds_matrix
-        );
-        cudaMemcpyAsync(this->config.non_sparse_matrix, non_sparse_offset,
-            sizeof(S) * mds_matrix_len,
-            cudaMemcpyHostToDevice, stream_copy_non_sparse
-        );
-        cudaMemcpyAsync(this->config.sparse_matrices, sparse_matrices_offset,
-            sizeof(S) * sparse_matrices_len,
-            cudaMemcpyHostToDevice, stream_copy_sparse_matrices
-        );
+    // Malloc memory for copying constants
+    cudaMallocAsync(&this->config.round_constants, sizeof(S) * round_constants_len, stream_copy_round_constants);
+    cudaMallocAsync(&this->config.mds_matrix, sizeof(S) * mds_matrix_len, stream_copy_mds_matrix);
+    cudaMallocAsync(&this->config.non_sparse_matrix, sizeof(S) * mds_matrix_len, stream_copy_non_sparse);
+    cudaMallocAsync(&this->config.sparse_matrices, sizeof(S) * sparse_matrices_len, stream_copy_sparse_matrices);

-        // Record finished copying event for streams
-        cudaEventRecord(event_copied_round_constants, stream_copy_round_constants);
-        cudaEventRecord(event_copy_mds_matrix, stream_copy_mds_matrix);
-        cudaEventRecord(event_copy_non_sparse, stream_copy_non_sparse);
-        cudaEventRecord(event_copy_sparse_matrices, stream_copy_sparse_matrices);
+    // Copy constants
+    cudaMemcpyAsync(
+      this->config.round_constants, constants, sizeof(S) * round_constants_len, cudaMemcpyHostToDevice,
+      stream_copy_round_constants);
+    cudaMemcpyAsync(
+      this->config.mds_matrix, mds_offset, sizeof(S) * mds_matrix_len, cudaMemcpyHostToDevice, stream_copy_mds_matrix);
+    cudaMemcpyAsync(
+      this->config.non_sparse_matrix, non_sparse_offset, sizeof(S) * mds_matrix_len, cudaMemcpyHostToDevice,
+      stream_copy_non_sparse);
+    cudaMemcpyAsync(
+      this->config.sparse_matrices, sparse_matrices_offset, sizeof(S) * sparse_matrices_len, cudaMemcpyHostToDevice,
+      stream_copy_sparse_matrices);

-        // Main stream waits for copying to finish
-        cudaStreamWaitEvent(stream, event_copied_round_constants);
-        cudaStreamWaitEvent(stream, event_copy_mds_matrix);
-        cudaStreamWaitEvent(stream, event_copy_non_sparse);
-        cudaStreamWaitEvent(stream, event_copy_sparse_matrices);
-    }
+    // Record finished copying event for streams
+    cudaEventRecord(event_copied_round_constants, stream_copy_round_constants);
+    cudaEventRecord(event_copy_mds_matrix, stream_copy_mds_matrix);
+    cudaEventRecord(event_copy_non_sparse, stream_copy_non_sparse);
+    cudaEventRecord(event_copy_sparse_matrices, stream_copy_sparse_matrices);

-    ~Poseidon() {
-        cudaFreeAsync(this->config.round_constants, this->stream);
-        cudaFreeAsync(this->config.mds_matrix, this->stream);
-        cudaFreeAsync(this->config.non_sparse_matrix, this->stream);
-        cudaFreeAsync(this->config.sparse_matrices, this->stream);
-    }
+    // Main stream waits for copying to finish
+    cudaStreamWaitEvent(stream, event_copied_round_constants);
+    cudaStreamWaitEvent(stream, event_copy_mds_matrix);
+    cudaStreamWaitEvent(stream, event_copy_non_sparse);
+    cudaStreamWaitEvent(stream, event_copy_sparse_matrices);
+  }

-    // Hash multiple preimages in parallel
-    void hash_blocks(const S * inp, size_t blocks, S * out, HashType hash_type, cudaStream_t stream);
+  ~Poseidon()
+  {
+    cudaFreeAsync(this->config.round_constants, this->stream);
+    cudaFreeAsync(this->config.mds_matrix, this->stream);
+    cudaFreeAsync(this->config.non_sparse_matrix, this->stream);
+    cudaFreeAsync(this->config.sparse_matrices, this->stream);
+  }

-  private:
-    S tree_domain_tag, const_input_no_pad_domain_tag;
-    cudaStream_t stream;
+  // Hash multiple preimages in parallel
+  void hash_blocks(const S* inp, size_t blocks, S* out, HashType hash_type, cudaStream_t stream);
+
+private:
+  S tree_domain_tag, const_input_no_pad_domain_tag;
+  cudaStream_t stream;
 };
--- a/icicle/appUtils/poseidon/poseidon_test.cu
+++ b/icicle/appUtils/poseidon/poseidon_test.cu
--- a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh
+++ b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh
@@ -1,9 +1,8 @@
 #ifndef VEC_MULT
 #define VEC_MULT
 #pragma once
-#include <stdexcept>
 #include <cuda.h>
-
+#include <stdexcept>

 #define MAX_THREADS_PER_BLOCK 256

@@ -13,128 +12,124 @@
 * @param n size of arr.
 * @param n_inv scalar of type S (scalar).
 */
- template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) {
-    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (tid < n) {
-      arr[tid] = scalar * arr[tid];
-    }
-  }
+template <typename E, typename S>
+__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar)
+{
+  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (tid < n) { arr[tid] = scalar * arr[tid]; }
+}

 // TODO: headers for prototypes and .c .cpp .cu files for implementations
 template <typename E, typename S>
-__global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n_elments)
+__global__ void vectorModMult(S* scalar_vec, E* element_vec, E* result, size_t n_elments)
 {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    if (tid < n_elments)
-    {
-        result[tid] = scalar_vec[tid] * element_vec[tid];
-    }
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_elments) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
 }

 template <typename E, typename S>
-int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments, cudaStream_t stream) // TODO: in place so no need for third result vector
+int vector_mod_mult(S* vec_a, E* vec_b, E* result, size_t n_elments, cudaStream_t stream) // TODO: in place so no need
+                                                                                          // for third result vector
 {
-    // Set the grid and block dimensions
-    int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
-    int threads_per_block = MAX_THREADS_PER_BLOCK;
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;

-    // Allocate memory on the device for the input vectors, the output vector, and the modulus
-    S *d_vec_a;
-    E *d_vec_b, *d_result;
-    cudaMallocAsync(&d_vec_a, n_elments * sizeof(S), stream);
-    cudaMallocAsync(&d_vec_b, n_elments * sizeof(E), stream);
-    cudaMallocAsync(&d_result, n_elments * sizeof(E), stream);
+  // Allocate memory on the device for the input vectors, the output vector, and the modulus
+  S* d_vec_a;
+  E *d_vec_b, *d_result;
+  cudaMallocAsync(&d_vec_a, n_elments * sizeof(S), stream);
+  cudaMallocAsync(&d_vec_b, n_elments * sizeof(E), stream);
+  cudaMallocAsync(&d_result, n_elments * sizeof(E), stream);

-    // Copy the input vectors and the modulus from the host to the device
-    cudaMemcpyAsync(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice, stream);
+  // Copy the input vectors and the modulus from the host to the device
+  cudaMemcpyAsync(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice, stream);

-    // Call the kernel to perform element-wise modular multiplication
-    vectorModMult<<<num_blocks, threads_per_block, 0, stream>>>(d_vec_a, d_vec_b, d_result, n_elments);
+  // Call the kernel to perform element-wise modular multiplication
+  vectorModMult<<<num_blocks, threads_per_block, 0, stream>>>(d_vec_a, d_vec_b, d_result, n_elments);

-    cudaMemcpyAsync(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost, stream);
+  cudaMemcpyAsync(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost, stream);

-    cudaFreeAsync(d_vec_a, stream);
-    cudaFreeAsync(d_vec_b, stream);
-    cudaFreeAsync(d_result, stream);
+  cudaFreeAsync(d_vec_a, stream);
+  cudaFreeAsync(d_vec_b, stream);
+  cudaFreeAsync(d_result, stream);

-    cudaStreamSynchronize(stream);
-    return 0;
+  cudaStreamSynchronize(stream);
+  return 0;
 }

 template <typename E, typename S>
-int vector_mod_mult_device(S *d_vec_a, E *d_vec_b, E *d_result, size_t n_elments) // TODO: in place so no need for third result vector
+int vector_mod_mult_device(
+  S* d_vec_a, E* d_vec_b, E* d_result, size_t n_elments) // TODO: in place so no need for third result vector
 {
-    // Set the grid and block dimensions
-    int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
-    int threads_per_block = MAX_THREADS_PER_BLOCK;
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;

-    // Call the kernel to perform element-wise modular multiplication
-    vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elments);
-    return 0;
+  // Call the kernel to perform element-wise modular multiplication
+  vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elments);
+  return 0;
 }

 template <typename E, typename S>
-__global__ void batchVectorMult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size)
+__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size)
 {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    if (tid < n_scalars * batch_size)
-    {
-        int scalar_id = tid % n_scalars;
-        element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
-    }
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_scalars * batch_size) {
+    int scalar_id = tid % n_scalars;
+    element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
+  }
 }

 template <typename E, typename S>
-int batch_vector_mult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream)
+int batch_vector_mult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream)
 {
-    // Set the grid and block dimensions
-    int NUM_THREADS = MAX_THREADS_PER_BLOCK;
-    int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS;
-    batchVectorMult<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(scalar_vec, element_vec, n_scalars, batch_size);
-    return 0;
+  // Set the grid and block dimensions
+  int NUM_THREADS = MAX_THREADS_PER_BLOCK;
+  int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS;
+  batchVectorMult<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(scalar_vec, element_vec, n_scalars, batch_size);
+  return 0;
 }

 template <typename E>
-__global__ void matrixVectorMult(E *matrix_elements, E *vector_elements, E *result, size_t dim)
+__global__ void matrixVectorMult(E* matrix_elements, E* vector_elements, E* result, size_t dim)
 {
-
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    if (tid < dim)
-    {
-        result[tid] = E::zero();
-        for (int i = 0; i < dim; i++)
-            result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i];
-    }
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < dim) {
+    result[tid] = E::zero();
+    for (int i = 0; i < dim; i++)
+      result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i];
+  }
 }

 template <typename E>
-int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim, cudaStream_t stream)
+int matrix_mod_mult(E* matrix_elements, E* vector_elements, E* result, size_t dim, cudaStream_t stream)
 {
-    // Set the grid and block dimensions
-    int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
-    int threads_per_block = MAX_THREADS_PER_BLOCK;
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;

-    // Allocate memory on the device for the input vectors, the output vector, and the modulus
-    E *d_matrix, *d_vector, *d_result;
-    cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream);
-    cudaMallocAsync(&d_vector, dim * sizeof(E), stream);
-    cudaMallocAsync(&d_result, dim * sizeof(E), stream);
+  // Allocate memory on the device for the input vectors, the output vector, and the modulus
+  E *d_matrix, *d_vector, *d_result;
+  cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream);
+  cudaMallocAsync(&d_vector, dim * sizeof(E), stream);
+  cudaMallocAsync(&d_result, dim * sizeof(E), stream);

-    // Copy the input vectors and the modulus from the host to the device
-    cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream);
+  // Copy the input vectors and the modulus from the host to the device
+  cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream);

-    // Call the kernel to perform element-wise modular multiplication
-    matrixVectorMult<<<num_blocks, threads_per_block, 0, stream>>>(d_matrix, d_vector, d_result, dim);
+  // Call the kernel to perform element-wise modular multiplication
+  matrixVectorMult<<<num_blocks, threads_per_block, 0, stream>>>(d_matrix, d_vector, d_result, dim);

-    cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream);
+  cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream);

-    cudaFreeAsync(d_matrix, stream);
-    cudaFreeAsync(d_vector, stream);
-    cudaFreeAsync(d_result, stream);
+  cudaFreeAsync(d_matrix, stream);
+  cudaFreeAsync(d_vector, stream);
+  cudaFreeAsync(d_result, stream);

-    cudaStreamSynchronize(stream);
-    return 0;
+  cudaStreamSynchronize(stream);
+  return 0;
 }
 #endif
--- a/icicle/curves/bls12_377/curve_config.cuh
+++ b/icicle/curves/bls12_377/curve_config.cuh
@@ -9,17 +9,17 @@
 #include "params.cuh"

 namespace BLS12_377 {
-    typedef Field<PARAMS_BLS12_377::fp_config> scalar_field_t;
-    typedef scalar_field_t scalar_t;
-    typedef Field<PARAMS_BLS12_377::fq_config> point_field_t;
-    static constexpr point_field_t b = point_field_t{ PARAMS_BLS12_377::weierstrass_b };
-    typedef Projective<point_field_t, scalar_field_t, b> projective_t;
-    typedef Affine<point_field_t> affine_t;
-    #if defined(G2_DEFINED)
-    typedef ExtensionField<PARAMS_BLS12_377::fq_config> g2_point_field_t;
-    static constexpr g2_point_field_t b_g2 = g2_point_field_t{ point_field_t{ PARAMS_BLS12_377::weierstrass_b_g2_re },
-                                                               point_field_t{ PARAMS_BLS12_377::weierstrass_b_g2_im }};
-    typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
-    typedef Affine<g2_point_field_t> g2_affine_t;
-    #endif
-}
+  typedef Field<PARAMS_BLS12_377::fp_config> scalar_field_t;
+  typedef scalar_field_t scalar_t;
+  typedef Field<PARAMS_BLS12_377::fq_config> point_field_t;
+  static constexpr point_field_t b = point_field_t{PARAMS_BLS12_377::weierstrass_b};
+  typedef Projective<point_field_t, scalar_field_t, b> projective_t;
+  typedef Affine<point_field_t> affine_t;
+#if defined(G2_DEFINED)
+  typedef ExtensionField<PARAMS_BLS12_377::fq_config> g2_point_field_t;
+  static constexpr g2_point_field_t b_g2 = g2_point_field_t{
+    point_field_t{PARAMS_BLS12_377::weierstrass_b_g2_re}, point_field_t{PARAMS_BLS12_377::weierstrass_b_g2_im}};
+  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  typedef Affine<g2_point_field_t> g2_affine_t;
+#endif
+} // namespace BLS12_377
--- a/icicle/curves/bls12_377/lde.cu
+++ b/icicle/curves/bls12_377/lde.cu
--- a/icicle/curves/bls12_377/msm.cu
+++ b/icicle/curves/bls12_377/msm.cu
@@ -1,186 +1,216 @@
 #ifndef _BLS12_377_MSM
 #define _BLS12_377_MSM
 #include "../../appUtils/msm/msm.cu"
-#include <stdexcept>
-#include <cuda.h>
 #include "curve_config.cuh"
+#include <cuda.h>
+#include <stdexcept>

-
-extern "C"
-int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[],
-              BLS12_377::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_cuda_bls12_377(
+  BLS12_377::projective_t* out,
+  BLS12_377::affine_t points[],
+  BLS12_377::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377::affine_t points[],
-                              BLS12_377::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_batch_cuda_bls12_377(
+  BLS12_377::projective_t* out,
+  BLS12_377::affine_t points[],
+  BLS12_377::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 /**
 * Commit to a polynomial using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut point to write the result to.
 * @param d_scalars Scalars for the MSM. Must be on device.
 * @param d_points Points for the MSM. Must be on device.
 * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
 */
-extern "C"
-int commit_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_cuda_bls12_377(
+  BLS12_377::projective_t* d_out,
+  BLS12_377::scalar_t* d_scalars,
+  BLS12_377::affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
- 
+
 /**
 * Commit to a batch of polynomials using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut point to write the results to.
 * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
 * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
 * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
 * @param batch_size Size of the batch.
 */
-extern "C"
-int commit_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_batch_cuda_bls12_377(
+  BLS12_377::projective_t* d_out,
+  BLS12_377::scalar_t* d_scalars,
+  BLS12_377::affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 #if defined(G2_DEFINED)
-extern "C"
-int msm_g2_cuda_bls12_377(BLS12_377::g2_projective_t *out, BLS12_377::g2_affine_t points[],
-              BLS12_377::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_g2_cuda_bls12_377(
+  BLS12_377::g2_projective_t* out,
+  BLS12_377::g2_affine_t points[],
+  BLS12_377::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<BLS12_377::scalar_t, BLS12_377::g2_projective_t, BLS12_377::g2_affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<BLS12_377::scalar_t, BLS12_377::g2_projective_t, BLS12_377::g2_affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int msm_batch_g2_cuda_bls12_377(BLS12_377::g2_projective_t* out, BLS12_377::g2_affine_t points[],
-                              BLS12_377::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_batch_g2_cuda_bls12_377(
+  BLS12_377::g2_projective_t* out,
+  BLS12_377::g2_affine_t points[],
+  BLS12_377::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<BLS12_377::scalar_t, BLS12_377::g2_projective_t, BLS12_377::g2_affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<BLS12_377::scalar_t, BLS12_377::g2_projective_t, BLS12_377::g2_affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 /**
 * Commit to a polynomial using the MSM in G2 group.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut G2 point to write the result to.
 * @param d_scalars Scalars for the MSM. Must be on device.
 * @param d_points G2 affine points for the MSM. Must be on device.
 * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
 */
-extern "C"
-int commit_g2_cuda_bls12_377(BLS12_377::g2_projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_g2_cuda_bls12_377(
+  BLS12_377::g2_projective_t* d_out,
+  BLS12_377::scalar_t* d_scalars,
+  BLS12_377::g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
- 
- /**
-  * Commit to a batch of polynomials using the MSM.
-  * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
-  * @param d_out Ouptut G2 point to write the results to.
-  * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
-  * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
-  * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
-  * @param batch_size Size of the batch.
-  */
-extern "C"
-int commit_batch_g2_cuda_bls12_377(BLS12_377::g2_projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+
+/**
+ * Commit to a batch of polynomials using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Ouptut G2 point to write the results to.
+ * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
+ * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for
+ * each MSM.
+ * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
+ * @param batch_size Size of the batch.
+ */
+extern "C" int commit_batch_g2_cuda_bls12_377(
+  BLS12_377::g2_projective_t* d_out,
+  BLS12_377::scalar_t* d_scalars,
+  BLS12_377::g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
 #endif
 #endif
--- a/icicle/curves/bls12_377/params.cuh
+++ b/icicle/curves/bls12_377/params.cuh
@@ -1,184 +1,329 @@
 #pragma once
 #include "../../utils/storage.cuh"

-namespace PARAMS_BLS12_377{
-  struct fp_config{
+namespace PARAMS_BLS12_377 {
+  struct fp_config {
    static constexpr unsigned limbs_count = 8;
    static constexpr unsigned omegas_count = 32;

-    static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
-    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd, 0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
-    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb, 0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
-    static constexpr storage<2*limbs_count> modulus_wide = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2*limbs_count> modulus_squared = {0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3, 0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
-    static constexpr storage<2*limbs_count> modulus_squared_2 = {0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7, 0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
-    static constexpr storage<2*limbs_count> modulus_squared_4 = {0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f, 0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
+    static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
+                                                     0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
+    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd,
+                                                       0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
+    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb,
+                                                       0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3,
+      0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7,
+      0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f,
+      0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
    static constexpr unsigned modulus_bit_count = 253;
-    static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b, 0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r =  {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f, 0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95, 0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
+    static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b,
+                                               0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f,
+                                                          0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95,
+                                                              0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};

-    static constexpr storage<limbs_count> omega1= {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega2= {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega3= {0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
-    static constexpr storage<limbs_count> omega4= {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega5= {0x0405f600, 0xfa8e7081, 0xf8a89660, 0x38b1c291, 0x6bda5fce, 0xefab9005, 0x92a3c754, 0x0b6b0756};
-    static constexpr storage<limbs_count> omega6= {0xaf0a50c8, 0xc5b2c78e, 0x4636deb3, 0x72e32a34, 0xb6f97778, 0x3d775d15, 0x2b16be6e, 0x0c4c070d};
-    static constexpr storage<limbs_count> omega7= {0x7a1ade2c, 0x3f5a4e73, 0x0120d1db, 0x71e5bca1, 0x3b2866fd, 0xbcb44162, 0x89c38db1, 0x06ed1a90};
-    static constexpr storage<limbs_count> omega8= {0xbd2cd25e, 0x61c5510e, 0x2b0d531c, 0xe2d70111, 0x94c3bd4b, 0x738f9894, 0x53182695, 0x0b1e0f1d};
-    static constexpr storage<limbs_count> omega9= {0x8cb9508c, 0xcfb2f75e, 0xf491e401, 0x4c14f244, 0x23c16afb, 0xc8f5265f, 0x70f3ff2a, 0x0cda7e27};
-    static constexpr storage<limbs_count> omega10= {0x0bdc32ee, 0xca77feb9, 0xd957f5a9, 0xf36ddfd4, 0x61ba14c4, 0x491c58f5, 0x93e8f339, 0x0618d3c9};
-    static constexpr storage<limbs_count> omega11= {0x2d89d82f, 0x68c3242e, 0x832a3729, 0xf9559645, 0xbceb62cc, 0x5c803c5e, 0x99ffa2f8, 0x1177cf5d};
-    static constexpr storage<limbs_count> omega12= {0x6932851a, 0xb6ed40f2, 0x1e0da12e, 0x79cbe7fb, 0x2a7d8f87, 0x8d408575, 0x7505d049, 0x11867341};
-    static constexpr storage<limbs_count> omega13= {0x07146cbf, 0x8cf7d87a, 0x109c4d23, 0x14ac37dc, 0x883e9660, 0x082d15f0, 0xad9ea9b8, 0x003719b1};
-    static constexpr storage<limbs_count> omega14= {0xfd0aee77, 0x2260e0dd, 0x1e33b6db, 0xc0cbbc3f, 0xfe7e1b36, 0xc8bf6747, 0x4cb802c1, 0x129e4fd5};
-    static constexpr storage<limbs_count> omega15= {0x8ac75741, 0x22f6fca2, 0xdd37b519, 0x8101b557, 0x1036226a, 0xf493bb8a, 0xfce05c2c, 0x06dbad6c};
-    static constexpr storage<limbs_count> omega16= {0x56733f8b, 0x7d246c24, 0xff70b46a, 0xbc3c4112, 0x6f13530b, 0x2c159b40, 0xc55d287b, 0x0c13137a};
-    static constexpr storage<limbs_count> omega17= {0xec8af73d, 0x8d24de3c, 0xcf722b45, 0x50f778d4, 0x15bc7dd7, 0xf4506bc3, 0xf94a16e1, 0x0e43ba91};
-    static constexpr storage<limbs_count> omega18= {0xd4405b8f, 0x0baa7b44, 0xee0f1394, 0xf8f3c7fe, 0xef0dfe6d, 0x46b153c0, 0x2dde6b95, 0x0ea2bcd9};
-    static constexpr storage<limbs_count> omega19= {0x3d1fa34e, 0x5f4dc975, 0x15af81db, 0xc28e54ee, 0x04947d99, 0x83d9a55f, 0x54a2b488, 0x08ec7ccf};
-    static constexpr storage<limbs_count> omega20= {0x0cac0ee8, 0x0d8fa7b3, 0x82ef38e4, 0x756284ed, 0xac8f90d2, 0x7014b194, 0x634e5d50, 0x092488f8};
-    static constexpr storage<limbs_count> omega21= {0x6d34ed69, 0xd85399bf, 0x09e49cef, 0x4d9012ba, 0xca00ae5d, 0x020142ee, 0x3bdfebfd, 0x12772e57};
-    static constexpr storage<limbs_count> omega22= {0x2eb41723, 0x676c8fc7, 0x5dd895bd, 0xe20380e2, 0x9bf22dde, 0x09dc8be8, 0x42638176, 0x12822f94};
-    static constexpr storage<limbs_count> omega23= {0x81a6d2de, 0x1f1df770, 0xcf29c812, 0x5d33b2da, 0x134f0e7e, 0x1bf162de, 0x1e2877a8, 0x045162c4};
-    static constexpr storage<limbs_count> omega24= {0xfecda1b6, 0x24f4503b, 0xded67d3c, 0x0e5d7ed3, 0x40cf20af, 0x2b7b7e5e, 0x4faad6af, 0x0d472650};
-    static constexpr storage<limbs_count> omega25= {0x584b9eb1, 0xcc6c474c, 0x15a8d886, 0x47670804, 0xbb8654c5, 0x07736d2f, 0xeb207a4b, 0x0d14ce7a};
-    static constexpr storage<limbs_count> omega26= {0xed25924a, 0xd1c6471c, 0x6bc312c3, 0xd98bb374, 0xfeae1a41, 0x50be0848, 0x3265c719, 0x04b07dea};
-    static constexpr storage<limbs_count> omega27= {0x618241e3, 0xab13f73e, 0x166ca902, 0x571c9267, 0x5e828a6d, 0x8586443a, 0x6daba50b, 0x093fdf2f};
-    static constexpr storage<limbs_count> omega28= {0xee11c34f, 0xe688e66b, 0xeacecf5a, 0xdc232eae, 0xb95ae685, 0x4fc35094, 0x7c1d31dc, 0x0273b5bd};
-    static constexpr storage<limbs_count> omega29= {0x1a9057bd, 0x8a8a5a77, 0x41834fbb, 0xdcbfae1d, 0xb34ede6e, 0x534f5b97, 0xb78bbd3e, 0x07313ac5};
-    static constexpr storage<limbs_count> omega30= {0x2be70731, 0x287abbb1, 0x7c35c5aa, 0x5cbcfd1e, 0x1671f4df, 0x7585b3fe, 0xb899c011, 0x08350ecf};
-    static constexpr storage<limbs_count> omega31= {0x09f7c5e2, 0x3400c14e, 0x0a649ea1, 0xc112e60c, 0x067ce95e, 0xf7510758, 0xf9daf17c, 0x040a66a5};
-    static constexpr storage<limbs_count> omega32= {0x43efecd3, 0x89d65957, 0x3bd6c318, 0x29246adc, 0xce01533c, 0xf9fb5ef6, 0x849078c3, 0x020410e4};
+    static constexpr storage<limbs_count> omega1 = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> omega2 = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> omega3 = {0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe,
+                                                    0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
+    static constexpr storage<limbs_count> omega4 = {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765,
+                                                    0x970dec00, 0x23ed1347, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> omega5 = {0x0405f600, 0xfa8e7081, 0xf8a89660, 0x38b1c291,
+                                                    0x6bda5fce, 0xefab9005, 0x92a3c754, 0x0b6b0756};
+    static constexpr storage<limbs_count> omega6 = {0xaf0a50c8, 0xc5b2c78e, 0x4636deb3, 0x72e32a34,
+                                                    0xb6f97778, 0x3d775d15, 0x2b16be6e, 0x0c4c070d};
+    static constexpr storage<limbs_count> omega7 = {0x7a1ade2c, 0x3f5a4e73, 0x0120d1db, 0x71e5bca1,
+                                                    0x3b2866fd, 0xbcb44162, 0x89c38db1, 0x06ed1a90};
+    static constexpr storage<limbs_count> omega8 = {0xbd2cd25e, 0x61c5510e, 0x2b0d531c, 0xe2d70111,
+                                                    0x94c3bd4b, 0x738f9894, 0x53182695, 0x0b1e0f1d};
+    static constexpr storage<limbs_count> omega9 = {0x8cb9508c, 0xcfb2f75e, 0xf491e401, 0x4c14f244,
+                                                    0x23c16afb, 0xc8f5265f, 0x70f3ff2a, 0x0cda7e27};
+    static constexpr storage<limbs_count> omega10 = {0x0bdc32ee, 0xca77feb9, 0xd957f5a9, 0xf36ddfd4,
+                                                     0x61ba14c4, 0x491c58f5, 0x93e8f339, 0x0618d3c9};
+    static constexpr storage<limbs_count> omega11 = {0x2d89d82f, 0x68c3242e, 0x832a3729, 0xf9559645,
+                                                     0xbceb62cc, 0x5c803c5e, 0x99ffa2f8, 0x1177cf5d};
+    static constexpr storage<limbs_count> omega12 = {0x6932851a, 0xb6ed40f2, 0x1e0da12e, 0x79cbe7fb,
+                                                     0x2a7d8f87, 0x8d408575, 0x7505d049, 0x11867341};
+    static constexpr storage<limbs_count> omega13 = {0x07146cbf, 0x8cf7d87a, 0x109c4d23, 0x14ac37dc,
+                                                     0x883e9660, 0x082d15f0, 0xad9ea9b8, 0x003719b1};
+    static constexpr storage<limbs_count> omega14 = {0xfd0aee77, 0x2260e0dd, 0x1e33b6db, 0xc0cbbc3f,
+                                                     0xfe7e1b36, 0xc8bf6747, 0x4cb802c1, 0x129e4fd5};
+    static constexpr storage<limbs_count> omega15 = {0x8ac75741, 0x22f6fca2, 0xdd37b519, 0x8101b557,
+                                                     0x1036226a, 0xf493bb8a, 0xfce05c2c, 0x06dbad6c};
+    static constexpr storage<limbs_count> omega16 = {0x56733f8b, 0x7d246c24, 0xff70b46a, 0xbc3c4112,
+                                                     0x6f13530b, 0x2c159b40, 0xc55d287b, 0x0c13137a};
+    static constexpr storage<limbs_count> omega17 = {0xec8af73d, 0x8d24de3c, 0xcf722b45, 0x50f778d4,
+                                                     0x15bc7dd7, 0xf4506bc3, 0xf94a16e1, 0x0e43ba91};
+    static constexpr storage<limbs_count> omega18 = {0xd4405b8f, 0x0baa7b44, 0xee0f1394, 0xf8f3c7fe,
+                                                     0xef0dfe6d, 0x46b153c0, 0x2dde6b95, 0x0ea2bcd9};
+    static constexpr storage<limbs_count> omega19 = {0x3d1fa34e, 0x5f4dc975, 0x15af81db, 0xc28e54ee,
+                                                     0x04947d99, 0x83d9a55f, 0x54a2b488, 0x08ec7ccf};
+    static constexpr storage<limbs_count> omega20 = {0x0cac0ee8, 0x0d8fa7b3, 0x82ef38e4, 0x756284ed,
+                                                     0xac8f90d2, 0x7014b194, 0x634e5d50, 0x092488f8};
+    static constexpr storage<limbs_count> omega21 = {0x6d34ed69, 0xd85399bf, 0x09e49cef, 0x4d9012ba,
+                                                     0xca00ae5d, 0x020142ee, 0x3bdfebfd, 0x12772e57};
+    static constexpr storage<limbs_count> omega22 = {0x2eb41723, 0x676c8fc7, 0x5dd895bd, 0xe20380e2,
+                                                     0x9bf22dde, 0x09dc8be8, 0x42638176, 0x12822f94};
+    static constexpr storage<limbs_count> omega23 = {0x81a6d2de, 0x1f1df770, 0xcf29c812, 0x5d33b2da,
+                                                     0x134f0e7e, 0x1bf162de, 0x1e2877a8, 0x045162c4};
+    static constexpr storage<limbs_count> omega24 = {0xfecda1b6, 0x24f4503b, 0xded67d3c, 0x0e5d7ed3,
+                                                     0x40cf20af, 0x2b7b7e5e, 0x4faad6af, 0x0d472650};
+    static constexpr storage<limbs_count> omega25 = {0x584b9eb1, 0xcc6c474c, 0x15a8d886, 0x47670804,
+                                                     0xbb8654c5, 0x07736d2f, 0xeb207a4b, 0x0d14ce7a};
+    static constexpr storage<limbs_count> omega26 = {0xed25924a, 0xd1c6471c, 0x6bc312c3, 0xd98bb374,
+                                                     0xfeae1a41, 0x50be0848, 0x3265c719, 0x04b07dea};
+    static constexpr storage<limbs_count> omega27 = {0x618241e3, 0xab13f73e, 0x166ca902, 0x571c9267,
+                                                     0x5e828a6d, 0x8586443a, 0x6daba50b, 0x093fdf2f};
+    static constexpr storage<limbs_count> omega28 = {0xee11c34f, 0xe688e66b, 0xeacecf5a, 0xdc232eae,
+                                                     0xb95ae685, 0x4fc35094, 0x7c1d31dc, 0x0273b5bd};
+    static constexpr storage<limbs_count> omega29 = {0x1a9057bd, 0x8a8a5a77, 0x41834fbb, 0xdcbfae1d,
+                                                     0xb34ede6e, 0x534f5b97, 0xb78bbd3e, 0x07313ac5};
+    static constexpr storage<limbs_count> omega30 = {0x2be70731, 0x287abbb1, 0x7c35c5aa, 0x5cbcfd1e,
+                                                     0x1671f4df, 0x7585b3fe, 0xb899c011, 0x08350ecf};
+    static constexpr storage<limbs_count> omega31 = {0x09f7c5e2, 0x3400c14e, 0x0a649ea1, 0xc112e60c,
+                                                     0x067ce95e, 0xf7510758, 0xf9daf17c, 0x040a66a5};
+    static constexpr storage<limbs_count> omega32 = {0x43efecd3, 0x89d65957, 0x3bd6c318, 0x29246adc,
+                                                     0xce01533c, 0xf9fb5ef6, 0x849078c3, 0x020410e4};

    static constexpr storage_array<omegas_count, limbs_count> omega = {
-        omega1, omega2, omega3, omega4, omega5, omega6, omega7, omega8, 
-        omega9, omega10, omega11, omega12, omega13, omega14, omega15, omega16,
-        omega17, omega18, omega19, omega20, omega21, omega22, omega23, omega24,
-        omega25, omega26, omega27, omega28, omega29, omega30, omega31, omega32,
+      omega1,  omega2,  omega3,  omega4,  omega5,  omega6,  omega7,  omega8,  omega9,  omega10, omega11,
+      omega12, omega13, omega14, omega15, omega16, omega17, omega18, omega19, omega20, omega21, omega22,
+      omega23, omega24, omega25, omega26, omega27, omega28, omega29, omega30, omega31, omega32,
    };

-    static constexpr storage<limbs_count> omega_inv1= {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega_inv2= {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega_inv3= {0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
-    static constexpr storage<limbs_count> omega_inv4= {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e};
-    static constexpr storage<limbs_count> omega_inv5= {0x29f04fbb, 0x401766f3, 0x0a4b98b2, 0x7e4e5f63, 0x9fbc28da, 0x35887f12, 0xdabe3b97, 0x045cb225};
-    static constexpr storage<limbs_count> omega_inv6= {0xac4ce534, 0xf3883827, 0x7c4940f0, 0x9f9a114f, 0x32cc3182, 0xe48527ee, 0x2877f4c2, 0x02d4450c};
-    static constexpr storage<limbs_count> omega_inv7= {0x4afbf0bb, 0xd2533833, 0x1d646d56, 0x20987ba6, 0xb8ae7d61, 0xf2c34c11, 0xb53ae995, 0x09962e74};
-    static constexpr storage<limbs_count> omega_inv8= {0x34f5271a, 0xd6aeb755, 0x493bb125, 0xc0e24cfd, 0x35cf1879, 0xc9d2a1ad, 0x19000e58, 0x0f3570fa};
-    static constexpr storage<limbs_count> omega_inv9= {0xbec3ee61, 0x2601423e, 0xb5252af1, 0x94f5ab4b, 0x205d09ca, 0xa1184628, 0x82a1fba2, 0x0e305e1e};
-    static constexpr storage<limbs_count> omega_inv10= {0x7e3320f2, 0x3cbad3a7, 0x4269c624, 0x7866653a, 0xa2fc13a2, 0xaf6d742d, 0xfe24db2a, 0x03ed8246};
-    static constexpr storage<limbs_count> omega_inv11= {0x30cff7d3, 0xcb6ab09e, 0xd88db7e6, 0x29949e69, 0x24db3cd4, 0xb9117dc6, 0xca8d11b5, 0x01b2aadd};
-    static constexpr storage<limbs_count> omega_inv12= {0x433b851c, 0x1c8fbc5d, 0x545e622f, 0x0ccc3b8c, 0x5c624e0f, 0x0fba9df2, 0x0496ddf9, 0x02d54c5d};
-    static constexpr storage<limbs_count> omega_inv13= {0x0a176838, 0x2ddbbfdd, 0xc4c77f0f, 0xb7a1e4f3, 0x41cad032, 0x645b4383, 0xbfb123c4, 0x0f3fe2e3};
-    static constexpr storage<limbs_count> omega_inv14= {0x9ff30538, 0x1d6d50fe, 0x8576b6fa, 0xca07f2d2, 0x720da6d2, 0x587839fa, 0xe9ebd753, 0x0038d5aa};
-    static constexpr storage<limbs_count> omega_inv15= {0x8e30fb24, 0xaeac713d, 0x21906459, 0xd004e9e3, 0xa60b0a33, 0x2fc54303, 0x14e545a6, 0x039063f8};
-    static constexpr storage<limbs_count> omega_inv16= {0x74d36c47, 0x112559bd, 0x4154b77a, 0x87db7016, 0x3843df80, 0x9e779ae5, 0x297077d0, 0x024424f2};
-    static constexpr storage<limbs_count> omega_inv17= {0x65953c15, 0xd649ae5e, 0x56accc60, 0x879fe571, 0xa3ba1e39, 0xba914f52, 0xd6ea78a2, 0x01b74920};
-    static constexpr storage<limbs_count> omega_inv18= {0x3d8a82b4, 0x319dea45, 0x8fc703de, 0x49468894, 0xc6b00817, 0x703f710f, 0xe862bc53, 0x007762fd};
-    static constexpr storage<limbs_count> omega_inv19= {0x5bae083f, 0x4f433336, 0x27612fe3, 0x485e079c, 0x7f8f0a07, 0xf83b6572, 0xca91a4d4, 0x06bdcaaf};
-    static constexpr storage<limbs_count> omega_inv20= {0xb2fb63eb, 0x4a0bf5e7, 0x996004d9, 0x6f64f8ec, 0x67519c5e, 0x0fecd781, 0x1cab2760, 0x04475eb3};
-    static constexpr storage<limbs_count> omega_inv21= {0xcd83d14f, 0xadbd6ce4, 0x750b194a, 0xc664d3bc, 0x89c9f437, 0x3034dfed, 0xcc2e643b, 0x03d502b8};
-    static constexpr storage<limbs_count> omega_inv22= {0x2272320b, 0xf89478a9, 0xd2e658b7, 0x3adac024, 0x94b25831, 0xf38d840f, 0x37dc6c4c, 0x04540b1f};
-    static constexpr storage<limbs_count> omega_inv23= {0xa6d411fe, 0x19d969b1, 0xf544a648, 0x973f00f7, 0xc9ed9f93, 0xb18f166c, 0xe7f21124, 0x02fba68e};
-    static constexpr storage<limbs_count> omega_inv24= {0x94921227, 0x78b96b20, 0x23b35b65, 0x07cd90db, 0xc843f1c3, 0x111f4fd9, 0xff729f23, 0x0ec4b820};
-    static constexpr storage<limbs_count> omega_inv25= {0x4879d823, 0x53eb200b, 0x93095f4a, 0x1971fac3, 0x86989a58, 0x8467ffe6, 0x306ed29d, 0x0af20231};
-    static constexpr storage<limbs_count> omega_inv26= {0xd4793454, 0x71c907bd, 0x7700defb, 0xc11aa47e, 0xbac11769, 0xf03e0873, 0x97419136, 0x0353190d};
-    static constexpr storage<limbs_count> omega_inv27= {0xa81a701c, 0x61a3deb6, 0x91bbbecf, 0xd8a4eda1, 0x6feb65df, 0x3f5339b1, 0x8b5421f2, 0x108adc5b};
-    static constexpr storage<limbs_count> omega_inv28= {0xe7bf5a41, 0x7d6c573a, 0xfa83b1f7, 0x8038b697, 0xa6718ce9, 0x2a988bee, 0x1239b708, 0x0846f362};
-    static constexpr storage<limbs_count> omega_inv29= {0xe3373548, 0x89a068a4, 0x78a6c4e5, 0xf31284cf, 0x6e9396d6, 0x9eed5c8d, 0x7e4342f9, 0x01643c65};
-    static constexpr storage<limbs_count> omega_inv30= {0x123a81f6, 0xc03a3272, 0x115b15e8, 0x377e6d2f, 0x2d6d7206, 0xed5575e4, 0x714004f2, 0x0b1e37e4};
-    static constexpr storage<limbs_count> omega_inv31= {0xdde8ffc5, 0x62a29589, 0x618c5d62, 0xfb6716e8, 0x88d61f25, 0x787e561c, 0xd2b21c7e, 0x0e351761};
-    static constexpr storage<limbs_count> omega_inv32= {0x7aca7fbe, 0xc9fea0e9, 0xb41a8854, 0x965ff314, 0x810eea7e, 0x743415d4, 0x8275bbd1, 0x0431c01b};
-    
+    static constexpr storage<limbs_count> omega_inv1 = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                        0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> omega_inv2 = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                        0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> omega_inv3 = {0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe,
+                                                        0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
+    static constexpr storage<limbs_count> omega_inv4 = {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99,
+                                                        0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e};
+    static constexpr storage<limbs_count> omega_inv5 = {0x29f04fbb, 0x401766f3, 0x0a4b98b2, 0x7e4e5f63,
+                                                        0x9fbc28da, 0x35887f12, 0xdabe3b97, 0x045cb225};
+    static constexpr storage<limbs_count> omega_inv6 = {0xac4ce534, 0xf3883827, 0x7c4940f0, 0x9f9a114f,
+                                                        0x32cc3182, 0xe48527ee, 0x2877f4c2, 0x02d4450c};
+    static constexpr storage<limbs_count> omega_inv7 = {0x4afbf0bb, 0xd2533833, 0x1d646d56, 0x20987ba6,
+                                                        0xb8ae7d61, 0xf2c34c11, 0xb53ae995, 0x09962e74};
+    static constexpr storage<limbs_count> omega_inv8 = {0x34f5271a, 0xd6aeb755, 0x493bb125, 0xc0e24cfd,
+                                                        0x35cf1879, 0xc9d2a1ad, 0x19000e58, 0x0f3570fa};
+    static constexpr storage<limbs_count> omega_inv9 = {0xbec3ee61, 0x2601423e, 0xb5252af1, 0x94f5ab4b,
+                                                        0x205d09ca, 0xa1184628, 0x82a1fba2, 0x0e305e1e};
+    static constexpr storage<limbs_count> omega_inv10 = {0x7e3320f2, 0x3cbad3a7, 0x4269c624, 0x7866653a,
+                                                         0xa2fc13a2, 0xaf6d742d, 0xfe24db2a, 0x03ed8246};
+    static constexpr storage<limbs_count> omega_inv11 = {0x30cff7d3, 0xcb6ab09e, 0xd88db7e6, 0x29949e69,
+                                                         0x24db3cd4, 0xb9117dc6, 0xca8d11b5, 0x01b2aadd};
+    static constexpr storage<limbs_count> omega_inv12 = {0x433b851c, 0x1c8fbc5d, 0x545e622f, 0x0ccc3b8c,
+                                                         0x5c624e0f, 0x0fba9df2, 0x0496ddf9, 0x02d54c5d};
+    static constexpr storage<limbs_count> omega_inv13 = {0x0a176838, 0x2ddbbfdd, 0xc4c77f0f, 0xb7a1e4f3,
+                                                         0x41cad032, 0x645b4383, 0xbfb123c4, 0x0f3fe2e3};
+    static constexpr storage<limbs_count> omega_inv14 = {0x9ff30538, 0x1d6d50fe, 0x8576b6fa, 0xca07f2d2,
+                                                         0x720da6d2, 0x587839fa, 0xe9ebd753, 0x0038d5aa};
+    static constexpr storage<limbs_count> omega_inv15 = {0x8e30fb24, 0xaeac713d, 0x21906459, 0xd004e9e3,
+                                                         0xa60b0a33, 0x2fc54303, 0x14e545a6, 0x039063f8};
+    static constexpr storage<limbs_count> omega_inv16 = {0x74d36c47, 0x112559bd, 0x4154b77a, 0x87db7016,
+                                                         0x3843df80, 0x9e779ae5, 0x297077d0, 0x024424f2};
+    static constexpr storage<limbs_count> omega_inv17 = {0x65953c15, 0xd649ae5e, 0x56accc60, 0x879fe571,
+                                                         0xa3ba1e39, 0xba914f52, 0xd6ea78a2, 0x01b74920};
+    static constexpr storage<limbs_count> omega_inv18 = {0x3d8a82b4, 0x319dea45, 0x8fc703de, 0x49468894,
+                                                         0xc6b00817, 0x703f710f, 0xe862bc53, 0x007762fd};
+    static constexpr storage<limbs_count> omega_inv19 = {0x5bae083f, 0x4f433336, 0x27612fe3, 0x485e079c,
+                                                         0x7f8f0a07, 0xf83b6572, 0xca91a4d4, 0x06bdcaaf};
+    static constexpr storage<limbs_count> omega_inv20 = {0xb2fb63eb, 0x4a0bf5e7, 0x996004d9, 0x6f64f8ec,
+                                                         0x67519c5e, 0x0fecd781, 0x1cab2760, 0x04475eb3};
+    static constexpr storage<limbs_count> omega_inv21 = {0xcd83d14f, 0xadbd6ce4, 0x750b194a, 0xc664d3bc,
+                                                         0x89c9f437, 0x3034dfed, 0xcc2e643b, 0x03d502b8};
+    static constexpr storage<limbs_count> omega_inv22 = {0x2272320b, 0xf89478a9, 0xd2e658b7, 0x3adac024,
+                                                         0x94b25831, 0xf38d840f, 0x37dc6c4c, 0x04540b1f};
+    static constexpr storage<limbs_count> omega_inv23 = {0xa6d411fe, 0x19d969b1, 0xf544a648, 0x973f00f7,
+                                                         0xc9ed9f93, 0xb18f166c, 0xe7f21124, 0x02fba68e};
+    static constexpr storage<limbs_count> omega_inv24 = {0x94921227, 0x78b96b20, 0x23b35b65, 0x07cd90db,
+                                                         0xc843f1c3, 0x111f4fd9, 0xff729f23, 0x0ec4b820};
+    static constexpr storage<limbs_count> omega_inv25 = {0x4879d823, 0x53eb200b, 0x93095f4a, 0x1971fac3,
+                                                         0x86989a58, 0x8467ffe6, 0x306ed29d, 0x0af20231};
+    static constexpr storage<limbs_count> omega_inv26 = {0xd4793454, 0x71c907bd, 0x7700defb, 0xc11aa47e,
+                                                         0xbac11769, 0xf03e0873, 0x97419136, 0x0353190d};
+    static constexpr storage<limbs_count> omega_inv27 = {0xa81a701c, 0x61a3deb6, 0x91bbbecf, 0xd8a4eda1,
+                                                         0x6feb65df, 0x3f5339b1, 0x8b5421f2, 0x108adc5b};
+    static constexpr storage<limbs_count> omega_inv28 = {0xe7bf5a41, 0x7d6c573a, 0xfa83b1f7, 0x8038b697,
+                                                         0xa6718ce9, 0x2a988bee, 0x1239b708, 0x0846f362};
+    static constexpr storage<limbs_count> omega_inv29 = {0xe3373548, 0x89a068a4, 0x78a6c4e5, 0xf31284cf,
+                                                         0x6e9396d6, 0x9eed5c8d, 0x7e4342f9, 0x01643c65};
+    static constexpr storage<limbs_count> omega_inv30 = {0x123a81f6, 0xc03a3272, 0x115b15e8, 0x377e6d2f,
+                                                         0x2d6d7206, 0xed5575e4, 0x714004f2, 0x0b1e37e4};
+    static constexpr storage<limbs_count> omega_inv31 = {0xdde8ffc5, 0x62a29589, 0x618c5d62, 0xfb6716e8,
+                                                         0x88d61f25, 0x787e561c, 0xd2b21c7e, 0x0e351761};
+    static constexpr storage<limbs_count> omega_inv32 = {0x7aca7fbe, 0xc9fea0e9, 0xb41a8854, 0x965ff314,
+                                                         0x810eea7e, 0x743415d4, 0x8275bbd1, 0x0431c01b};
+
    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
-        omega_inv1, omega_inv2, omega_inv3, omega_inv4, omega_inv5, omega_inv6, omega_inv7, omega_inv8, 
-        omega_inv9, omega_inv10, omega_inv11, omega_inv12, omega_inv13, omega_inv14, omega_inv15, omega_inv16,
-        omega_inv17, omega_inv18, omega_inv19, omega_inv20, omega_inv21, omega_inv22, omega_inv23, omega_inv24,
-        omega_inv25, omega_inv26, omega_inv27, omega_inv28, omega_inv29, omega_inv30, omega_inv31, omega_inv32,
+      omega_inv1,  omega_inv2,  omega_inv3,  omega_inv4,  omega_inv5,  omega_inv6,  omega_inv7,  omega_inv8,
+      omega_inv9,  omega_inv10, omega_inv11, omega_inv12, omega_inv13, omega_inv14, omega_inv15, omega_inv16,
+      omega_inv17, omega_inv18, omega_inv19, omega_inv20, omega_inv21, omega_inv22, omega_inv23, omega_inv24,
+      omega_inv25, omega_inv26, omega_inv27, omega_inv28, omega_inv29, omega_inv30, omega_inv31, omega_inv32,
    };

-    static constexpr storage<limbs_count> inv1= {0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af};
-    static constexpr storage<limbs_count> inv2= {0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f, 0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06};
-    static constexpr storage<limbs_count> inv3= {0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e, 0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2};
-    static constexpr storage<limbs_count> inv4= {0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e, 0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08};
-    static constexpr storage<limbs_count> inv5= {0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346, 0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33};
-    static constexpr storage<limbs_count> inv6= {0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22, 0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9};
-    static constexpr storage<limbs_count> inv7= {0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210, 0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93};
-    static constexpr storage<limbs_count> inv8= {0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87, 0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9};
-    static constexpr storage<limbs_count> inv9= {0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3, 0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab};
-    static constexpr storage<limbs_count> inv10= {0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61, 0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85};
-    static constexpr storage<limbs_count> inv11= {0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af, 0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1};
-    static constexpr storage<limbs_count> inv12= {0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57, 0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8};
-    static constexpr storage<limbs_count> inv13= {0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab, 0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003};
-    static constexpr storage<limbs_count> inv14= {0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054, 0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1};
-    static constexpr storage<limbs_count> inv15= {0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9, 0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007};
-    static constexpr storage<limbs_count> inv16= {0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54, 0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3};
-    static constexpr storage<limbs_count> inv17= {0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29, 0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08};
-    static constexpr storage<limbs_count> inv18= {0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094, 0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3};
-    static constexpr storage<limbs_count> inv19= {0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9, 0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309};
-    static constexpr storage<limbs_count> inv20= {0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164, 0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433};
-    static constexpr storage<limbs_count> inv21= {0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31, 0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9};
-    static constexpr storage<limbs_count> inv22= {0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198, 0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513};
-    static constexpr storage<limbs_count> inv23= {0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b, 0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539};
-    static constexpr storage<limbs_count> inv24= {0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5, 0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b};
-    static constexpr storage<limbs_count> inv25= {0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651, 0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555};
-    static constexpr storage<limbs_count> inv26= {0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8, 0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559};
-    static constexpr storage<limbs_count> inv27= {0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3, 0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c};
-    static constexpr storage<limbs_count> inv28= {0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9, 0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d};
-    static constexpr storage<limbs_count> inv29= {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e};
-    static constexpr storage<limbs_count> inv30= {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e};
-    static constexpr storage<limbs_count> inv31= {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e};
-    static constexpr storage<limbs_count> inv32= {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e};
+    static constexpr storage<limbs_count> inv1 = {0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f,
+                                                  0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af};
+    static constexpr storage<limbs_count> inv2 = {0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f,
+                                                  0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06};
+    static constexpr storage<limbs_count> inv3 = {0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e,
+                                                  0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2};
+    static constexpr storage<limbs_count> inv4 = {0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e,
+                                                  0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08};
+    static constexpr storage<limbs_count> inv5 = {0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346,
+                                                  0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33};
+    static constexpr storage<limbs_count> inv6 = {0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22,
+                                                  0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9};
+    static constexpr storage<limbs_count> inv7 = {0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210,
+                                                  0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93};
+    static constexpr storage<limbs_count> inv8 = {0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87,
+                                                  0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9};
+    static constexpr storage<limbs_count> inv9 = {0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3,
+                                                  0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab};
+    static constexpr storage<limbs_count> inv10 = {0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61,
+                                                   0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85};
+    static constexpr storage<limbs_count> inv11 = {0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af,
+                                                   0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1};
+    static constexpr storage<limbs_count> inv12 = {0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57,
+                                                   0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8};
+    static constexpr storage<limbs_count> inv13 = {0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab,
+                                                   0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003};
+    static constexpr storage<limbs_count> inv14 = {0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054,
+                                                   0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1};
+    static constexpr storage<limbs_count> inv15 = {0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9,
+                                                   0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007};
+    static constexpr storage<limbs_count> inv16 = {0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54,
+                                                   0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3};
+    static constexpr storage<limbs_count> inv17 = {0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29,
+                                                   0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08};
+    static constexpr storage<limbs_count> inv18 = {0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094,
+                                                   0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3};
+    static constexpr storage<limbs_count> inv19 = {0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9,
+                                                   0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309};
+    static constexpr storage<limbs_count> inv20 = {0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164,
+                                                   0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433};
+    static constexpr storage<limbs_count> inv21 = {0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31,
+                                                   0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9};
+    static constexpr storage<limbs_count> inv22 = {0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198,
+                                                   0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513};
+    static constexpr storage<limbs_count> inv23 = {0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b,
+                                                   0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539};
+    static constexpr storage<limbs_count> inv24 = {0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5,
+                                                   0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b};
+    static constexpr storage<limbs_count> inv25 = {0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651,
+                                                   0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555};
+    static constexpr storage<limbs_count> inv26 = {0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8,
+                                                   0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559};
+    static constexpr storage<limbs_count> inv27 = {0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3,
+                                                   0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c};
+    static constexpr storage<limbs_count> inv28 = {0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9,
+                                                   0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d};
+    static constexpr storage<limbs_count> inv29 = {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4,
+                                                   0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e};
+    static constexpr storage<limbs_count> inv30 = {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9,
+                                                   0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e};
+    static constexpr storage<limbs_count> inv31 = {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc,
+                                                   0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e};
+    static constexpr storage<limbs_count> inv32 = {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd,
+                                                   0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e};

    static constexpr storage_array<omegas_count, limbs_count> inv = {
-        inv1, inv2, inv3, inv4, inv5, inv6, inv7, inv8, 
-        inv9, inv10, inv11, inv12, inv13, inv14, inv15, inv16,
-        inv17, inv18, inv19, inv20, inv21, inv22, inv23, inv24,
-        inv25, inv26, inv27, inv28, inv29, inv30, inv31, inv32,
-    };    
+      inv1,  inv2,  inv3,  inv4,  inv5,  inv6,  inv7,  inv8,  inv9,  inv10, inv11, inv12, inv13, inv14, inv15, inv16,
+      inv17, inv18, inv19, inv20, inv21, inv22, inv23, inv24, inv25, inv26, inv27, inv28, inv29, inv30, inv31, inv32,
+    };
  };

-  struct fq_config{
+  struct fq_config {
    static constexpr unsigned limbs_count = 12;
-    static constexpr storage<limbs_count> modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46};
-    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x0a118000, 0x60000001, 0x2e16ba88, 0x74129000, 0x3de6c45f, 0x01ea271e, 0x3445b3e6, 0xd9429276, 0x8c760b80, 0x2f8a21d5, 0x035c748c};
-    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x14230000, 0xc0000002, 0x5c2d7510, 0xe8252000, 0x7bcd88be, 0x03d44e3c, 0x688b67cc, 0xb28524ec, 0x18ec1701, 0x5f1443ab, 0x06b8e918};
-    static constexpr storage<2*limbs_count> modulus_wide = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2*limbs_count> modulus_squared = {0x00000001, 0x0a118000, 0xf0000001, 0x7338d254, 0x2e1bd800, 0x4ada268f, 0x35f1c09a, 0x6bcbfbd2, 0x58638c9d, 0x318324b9, 0x8bb70ae0, 0x460aaaaa, 0x502a4d6c, 0xc014e712, 0xb90660cd, 0x09d018af, 0x3dda4d5c, 0x1f5e7141, 0xa4aee93f, 0x4bb8b87d, 0xb361263c, 0x2256913b, 0xd0bbaffb, 0x0002d307};
-    static constexpr storage<2*limbs_count> modulus_squared_2 = {0x00000002, 0x14230000, 0xe0000002, 0xe671a4a9, 0x5c37b000, 0x95b44d1e, 0x6be38134, 0xd797f7a4, 0xb0c7193a, 0x63064972, 0x176e15c0, 0x8c155555, 0xa0549ad8, 0x8029ce24, 0x720cc19b, 0x13a0315f, 0x7bb49ab8, 0x3ebce282, 0x495dd27e, 0x977170fb, 0x66c24c78, 0x44ad2277, 0xa1775ff6, 0x0005a60f};
-    static constexpr storage<2*limbs_count> modulus_squared_4 = {0x00000004, 0x28460000, 0xc0000004, 0xcce34953, 0xb86f6001, 0x2b689a3c, 0xd7c70269, 0xaf2fef48, 0x618e3275, 0xc60c92e5, 0x2edc2b80, 0x182aaaaa, 0x40a935b1, 0x00539c49, 0xe4198337, 0x274062be, 0xf7693570, 0x7d79c504, 0x92bba4fc, 0x2ee2e1f6, 0xcd8498f1, 0x895a44ee, 0x42eebfec, 0x000b4c1f};
+    static constexpr storage<limbs_count> modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44,
+                                                     0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
+                                                     0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46};
+    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x0a118000, 0x60000001, 0x2e16ba88,
+                                                       0x74129000, 0x3de6c45f, 0x01ea271e, 0x3445b3e6,
+                                                       0xd9429276, 0x8c760b80, 0x2f8a21d5, 0x035c748c};
+    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x14230000, 0xc0000002, 0x5c2d7510,
+                                                       0xe8252000, 0x7bcd88be, 0x03d44e3c, 0x688b67cc,
+                                                       0xb28524ec, 0x18ec1701, 0x5f1443ab, 0x06b8e918};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
+      0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x00000001, 0x0a118000, 0xf0000001, 0x7338d254, 0x2e1bd800, 0x4ada268f, 0x35f1c09a, 0x6bcbfbd2,
+      0x58638c9d, 0x318324b9, 0x8bb70ae0, 0x460aaaaa, 0x502a4d6c, 0xc014e712, 0xb90660cd, 0x09d018af,
+      0x3dda4d5c, 0x1f5e7141, 0xa4aee93f, 0x4bb8b87d, 0xb361263c, 0x2256913b, 0xd0bbaffb, 0x0002d307};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x00000002, 0x14230000, 0xe0000002, 0xe671a4a9, 0x5c37b000, 0x95b44d1e, 0x6be38134, 0xd797f7a4,
+      0xb0c7193a, 0x63064972, 0x176e15c0, 0x8c155555, 0xa0549ad8, 0x8029ce24, 0x720cc19b, 0x13a0315f,
+      0x7bb49ab8, 0x3ebce282, 0x495dd27e, 0x977170fb, 0x66c24c78, 0x44ad2277, 0xa1775ff6, 0x0005a60f};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x00000004, 0x28460000, 0xc0000004, 0xcce34953, 0xb86f6001, 0x2b689a3c, 0xd7c70269, 0xaf2fef48,
+      0x618e3275, 0xc60c92e5, 0x2edc2b80, 0x182aaaaa, 0x40a935b1, 0x00539c49, 0xe4198337, 0x274062be,
+      0xf7693570, 0x7d79c504, 0x92bba4fc, 0x2ee2e1f6, 0xcd8498f1, 0x895a44ee, 0x42eebfec, 0x000b4c1f};
    static constexpr unsigned modulus_bit_count = 377;
-    static constexpr storage<limbs_count> m = {0x5e4daffc, 0x1f9fd58c, 0x89c42a59, 0xd0ed6877, 0xd85a6d02, 0x6af2d488, 0x6776b1a0, 0x3bbad0de, 0x582ef4f7, 0x976c3ca0, 0x0cc4060e, 0x0261508d};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xffffff, 0xf73fffff, 0xffffff7a, 0xf4a2bbcf, 0xf6b7ffe8, 0x0c9dd045, 0x0aec70e1, 0xdd260cff, 0x5eb6c4e5, 0xc4fa3f93, 0x3aef1539, 0x51c5b9e8};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x934f3a1, 0xb0909a28, 0xc1cfac62, 0x3264aa55, 0x2a491ae8, 0xaccd49ca, 0xe80e9a61, 0x28b2dce9, 0x26f7c08a, 0x4d313ea1, 0x36254563, 0x161de1ee};
+    static constexpr storage<limbs_count> m = {0x5e4daffc, 0x1f9fd58c, 0x89c42a59, 0xd0ed6877, 0xd85a6d02, 0x6af2d488,
+                                               0x6776b1a0, 0x3bbad0de, 0x582ef4f7, 0x976c3ca0, 0x0cc4060e, 0x0261508d};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xffffff,   0xf73fffff, 0xffffff7a, 0xf4a2bbcf,
+                                                          0xf6b7ffe8, 0x0c9dd045, 0x0aec70e1, 0xdd260cff,
+                                                          0x5eb6c4e5, 0xc4fa3f93, 0x3aef1539, 0x51c5b9e8};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x934f3a1,  0xb0909a28, 0xc1cfac62, 0x3264aa55,
+                                                              0x2a491ae8, 0xaccd49ca, 0xe80e9a61, 0x28b2dce9,
+                                                              0x26f7c08a, 0x4d313ea1, 0x36254563, 0x161de1ee};
    // i^2, the square of the imaginary unit for the extension field
    static constexpr uint32_t i_squared = 5;
    // true if i^2 is negative
    static constexpr bool i_squared_is_negative = true;
-    // G1 and G2 generators 
-    static constexpr storage<limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512, 0xbd37cb5c, 0x188282c8,
-                                                         0xaa9d41bb, 0x85951e2c, 0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
-    static constexpr storage<limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36, 0x4fb82305, 0x6d182ad4,
-                                                         0xca3e52d9, 0xbd7fb348, 0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
-    static constexpr storage<limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52, 0x57db6b9b, 0x7ea501f5, 
-                                                            0x203e5031, 0xc565f071, 0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
-    static constexpr storage<limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984, 0x0799c9de, 0xe7223ece, 
-                                                            0x6651cecb, 0x532777ee, 0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
-    static constexpr storage<limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888, 0xf832d204, 0xe458c282, 
-                                                            0x74b49a58, 0xde03ed72, 0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
-    static constexpr storage<limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f, 0xcee304c2, 0x2463b01a,
-                                                            0x3d591bf1, 0x61ef11ac, 0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
+    // G1 and G2 generators
+    static constexpr storage<limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
+                                                      0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
+                                                      0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
+    static constexpr storage<limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
+                                                      0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
+                                                      0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
+    static constexpr storage<limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
+                                                         0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
+                                                         0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
+    static constexpr storage<limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
+                                                         0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
+                                                         0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
+    static constexpr storage<limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
+                                                         0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
+                                                         0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
+    static constexpr storage<limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
+                                                         0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
+                                                         0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
  };

-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                          0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {0x9999999a, 0x1c9ed999, 0x1ccccccd, 0x0dd39e5c, 0x3c6bf800, 0x129207b6,
-                                                                          0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6};
-}
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x9999999a, 0x1c9ed999, 0x1ccccccd, 0x0dd39e5c, 0x3c6bf800, 0x129207b6,
+    0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6};
+} // namespace PARAMS_BLS12_377
--- a/icicle/curves/bls12_377/projective.cu
+++ b/icicle/curves/bls12_377/projective.cu
@@ -1,50 +1,45 @@
-#include <cuda.h>
-#include "curve_config.cuh"
 #include "../../primitives/projective.cuh"
+#include "curve_config.cuh"
+#include <cuda.h>

-extern "C" BLS12_377::projective_t random_projective_bls12_377()
-{
-  return BLS12_377::projective_t::rand_host();
-}
+extern "C" BLS12_377::projective_t random_projective_bls12_377() { return BLS12_377::projective_t::rand_host(); }

-extern "C" BLS12_377::projective_t projective_zero_bls12_377()
-{
-  return BLS12_377::projective_t::zero();
-}
+extern "C" BLS12_377::projective_t projective_zero_bls12_377() { return BLS12_377::projective_t::zero(); }

-extern "C" bool projective_is_on_curve_bls12_377(BLS12_377::projective_t *point1)
+extern "C" bool projective_is_on_curve_bls12_377(BLS12_377::projective_t* point1)
 {
  return BLS12_377::projective_t::is_on_curve(*point1);
 }

-extern "C" BLS12_377::affine_t projective_to_affine_bls12_377(BLS12_377::projective_t *point1)
+extern "C" BLS12_377::affine_t projective_to_affine_bls12_377(BLS12_377::projective_t* point1)
 {
  return BLS12_377::projective_t::to_affine(*point1);
 }

-extern "C" BLS12_377::projective_t projective_from_affine_bls12_377(BLS12_377::affine_t *point1)
+extern "C" BLS12_377::projective_t projective_from_affine_bls12_377(BLS12_377::affine_t* point1)
 {
  return BLS12_377::projective_t::from_affine(*point1);
 }

-extern "C" BLS12_377::scalar_field_t random_scalar_bls12_377()
-{
-  return BLS12_377::scalar_field_t::rand_host();
-}
+extern "C" BLS12_377::scalar_field_t random_scalar_bls12_377() { return BLS12_377::scalar_field_t::rand_host(); }

-extern "C" bool eq_bls12_377(BLS12_377::projective_t *point1, BLS12_377::projective_t *point2)
+extern "C" bool eq_bls12_377(BLS12_377::projective_t* point1, BLS12_377::projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BLS12_377::point_field_t::zero()) && (point1->y == BLS12_377::point_field_t::zero()) && (point1->z == BLS12_377::point_field_t::zero())) && 
-  !((point2->x == BLS12_377::point_field_t::zero()) && (point2->y == BLS12_377::point_field_t::zero()) && (point2->z == BLS12_377::point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_377::point_field_t::zero()) && (point1->y == BLS12_377::point_field_t::zero()) &&
+           (point1->z == BLS12_377::point_field_t::zero())) &&
+         !((point2->x == BLS12_377::point_field_t::zero()) && (point2->y == BLS12_377::point_field_t::zero()) &&
+           (point2->z == BLS12_377::point_field_t::zero()));
 }

 #if defined(G2_DEFINED)
-extern "C" bool eq_g2_bls12_377(BLS12_377::g2_projective_t *point1, BLS12_377::g2_projective_t *point2)
+extern "C" bool eq_g2_bls12_377(BLS12_377::g2_projective_t* point1, BLS12_377::g2_projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BLS12_377::g2_point_field_t::zero()) && (point1->y == BLS12_377::g2_point_field_t::zero()) && (point1->z == BLS12_377::g2_point_field_t::zero())) && 
-  !((point2->x == BLS12_377::g2_point_field_t::zero()) && (point2->y == BLS12_377::g2_point_field_t::zero()) && (point2->z == BLS12_377::g2_point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_377::g2_point_field_t::zero()) && (point1->y == BLS12_377::g2_point_field_t::zero()) &&
+           (point1->z == BLS12_377::g2_point_field_t::zero())) &&
+         !((point2->x == BLS12_377::g2_point_field_t::zero()) && (point2->y == BLS12_377::g2_point_field_t::zero()) &&
+           (point2->z == BLS12_377::g2_point_field_t::zero()));
 }

 extern "C" BLS12_377::g2_projective_t random_g2_projective_bls12_377()
@@ -52,17 +47,17 @@ extern "C" BLS12_377::g2_projective_t random_g2_projective_bls12_377()
  return BLS12_377::g2_projective_t::rand_host();
 }

-extern "C" BLS12_377::g2_affine_t g2_projective_to_affine_bls12_377(BLS12_377::g2_projective_t *point1)
+extern "C" BLS12_377::g2_affine_t g2_projective_to_affine_bls12_377(BLS12_377::g2_projective_t* point1)
 {
  return BLS12_377::g2_projective_t::to_affine(*point1);
 }

-extern "C" BLS12_377::g2_projective_t g2_projective_from_affine_bls12_377(BLS12_377::g2_affine_t *point1)
+extern "C" BLS12_377::g2_projective_t g2_projective_from_affine_bls12_377(BLS12_377::g2_affine_t* point1)
 {
  return BLS12_377::g2_projective_t::from_affine(*point1);
 }

-extern "C" bool g2_projective_is_on_curve_bls12_377(BLS12_377::g2_projective_t *point1)
+extern "C" bool g2_projective_is_on_curve_bls12_377(BLS12_377::g2_projective_t* point1)
 {
  return BLS12_377::g2_projective_t::is_on_curve(*point1);
 }
--- a/icicle/curves/bls12_377/supported_operations.cu
+++ b/icicle/curves/bls12_377/supported_operations.cu
@@ -1,4 +1,4 @@
-#include "projective.cu"
 #include "lde.cu"
 #include "msm.cu"
+#include "projective.cu"
 #include "ve_mod_mult.cu"
--- a/icicle/curves/bls12_377/ve_mod_mult.cu
+++ b/icicle/curves/bls12_377/ve_mod_mult.cu
@@ -1,88 +1,78 @@
 #ifndef _BLS12_377_VEC_MULT
 #define _BLS12_377_VEC_MULT
-#include <stdio.h>
-#include <iostream>
-#include "../../primitives/field.cuh"
-#include "../../utils/storage.cuh"
-#include "../../primitives/projective.cuh"
-#include "curve_config.cuh"
 #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/storage.cuh"
+#include "curve_config.cuh"
+#include <iostream>
+#include <stdio.h>

-
-extern "C" int32_t vec_mod_mult_point_bls12_377(BLS12_377::projective_t *inout,
-                                      BLS12_377::scalar_t *scalar_vec,
-                                      size_t n_elments,
-                                      size_t device_id,
-                                      cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_point_bls12_377(
+  BLS12_377::projective_t* inout,
+  BLS12_377::scalar_t* scalar_vec,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<BLS12_377::projective_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t vec_mod_mult_scalar_bls12_377(BLS12_377::scalar_t *inout,
-                                       BLS12_377::scalar_t *scalar_vec,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_scalar_bls12_377(
+  BLS12_377::scalar_t* inout,
+  BLS12_377::scalar_t* scalar_vec,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<BLS12_377::scalar_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

 extern "C" int32_t vec_mod_mult_device_scalar_bls12_377(
-    BLS12_377::scalar_t *inout,
-    BLS12_377::scalar_t *scalar_vec,
-    size_t n_elements,
-    size_t device_id
-) {
+  BLS12_377::scalar_t* inout, BLS12_377::scalar_t* scalar_vec, size_t n_elements, size_t device_id)
+{
  try {
    vector_mod_mult_device<BLS12_377::scalar_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elements);
    return CUDA_SUCCESS;
-  } catch (const std::runtime_error &ex) {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t matrix_vec_mod_mult_bls12_377(BLS12_377::scalar_t *matrix_flattened,
-                                       BLS12_377::scalar_t *input,
-                                       BLS12_377::scalar_t *output,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t matrix_vec_mod_mult_bls12_377(
+  BLS12_377::scalar_t* matrix_flattened,
+  BLS12_377::scalar_t* input,
+  BLS12_377::scalar_t* output,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    matrix_mod_mult<BLS12_377::scalar_t>(matrix_flattened, input, output, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
--- a/icicle/curves/bls12_381/curve_config.cuh
+++ b/icicle/curves/bls12_381/curve_config.cuh
@@ -9,17 +9,17 @@
 #include "params.cuh"

 namespace BLS12_381 {
-    typedef Field<PARAMS_BLS12_381::fp_config> scalar_field_t;
-    typedef scalar_field_t scalar_t;
-    typedef Field<PARAMS_BLS12_381::fq_config> point_field_t;
-    static constexpr point_field_t b = point_field_t{ PARAMS_BLS12_381::weierstrass_b };
-    typedef Projective<point_field_t, scalar_field_t, b> projective_t;
-    typedef Affine<point_field_t> affine_t;
-    #if defined(G2_DEFINED)
-    typedef ExtensionField<PARAMS_BLS12_381::fq_config> g2_point_field_t;
-    static constexpr g2_point_field_t b_g2 = g2_point_field_t{ point_field_t{ PARAMS_BLS12_381::weierstrass_b_g2_re },
-                                                               point_field_t{ PARAMS_BLS12_381::weierstrass_b_g2_im }};
-    typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
-    typedef Affine<g2_point_field_t> g2_affine_t;
-    #endif
-}
+  typedef Field<PARAMS_BLS12_381::fp_config> scalar_field_t;
+  typedef scalar_field_t scalar_t;
+  typedef Field<PARAMS_BLS12_381::fq_config> point_field_t;
+  static constexpr point_field_t b = point_field_t{PARAMS_BLS12_381::weierstrass_b};
+  typedef Projective<point_field_t, scalar_field_t, b> projective_t;
+  typedef Affine<point_field_t> affine_t;
+#if defined(G2_DEFINED)
+  typedef ExtensionField<PARAMS_BLS12_381::fq_config> g2_point_field_t;
+  static constexpr g2_point_field_t b_g2 = g2_point_field_t{
+    point_field_t{PARAMS_BLS12_381::weierstrass_b_g2_re}, point_field_t{PARAMS_BLS12_381::weierstrass_b_g2_im}};
+  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  typedef Affine<g2_point_field_t> g2_affine_t;
+#endif
+} // namespace BLS12_381
--- a/icicle/curves/bls12_381/lde.cu
+++ b/icicle/curves/bls12_381/lde.cu
@@ -1,523 +1,560 @@
 #ifndef _BLS12_381_LDE
 #define _BLS12_381_LDE
-#include <cuda.h>
 #include "../../appUtils/ntt/lde.cu"
 #include "../../appUtils/ntt/ntt.cuh"
 #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
-#include "curve_config.cuh"
 #include "../../utils/mont.cuh"
+#include "curve_config.cuh"
+#include <cuda.h>

-extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(
+  uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        if (inverse) {
-            return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega_inv(logn), stream);
-        } else {
-            return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega(logn), stream);
-        }
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return nullptr;
+  try {
+    cudaStreamCreate(&stream);
+    if (inverse) {
+      return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega_inv(logn), stream);
+    } else {
+      return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega(logn), stream);
    }
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return nullptr;
+  }
 }

-extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int ntt_cuda_bls12_381(
+  BLS12_381::scalar_t* arr,
+  uint32_t n,
+  bool inverse,
+  Decimation decimation,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_template<BLS12_381::scalar_t,BLS12_381::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        
-        return -1;        
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return ntt_end2end_template<BLS12_381::scalar_t, BLS12_381::scalar_t>(
+      arr, n, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+
+    return -1;
+  }
 }

-extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int ecntt_cuda_bls12_381(
+  BLS12_381::projective_t* arr,
+  uint32_t n,
+  bool inverse,
+  Decimation decimation,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_template<BLS12_381::projective_t,BLS12_381::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return ntt_end2end_template<BLS12_381::projective_t, BLS12_381::scalar_t>(
+      arr, n, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int ntt_batch_cuda_bls12_381(
+  BLS12_381::scalar_t* arr,
+  uint32_t arr_size,
+  uint32_t batch_size,
+  bool inverse,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_batch_template<BLS12_381::scalar_t,BLS12_381::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return ntt_end2end_batch_template<BLS12_381::scalar_t, BLS12_381::scalar_t>(
+      arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int ecntt_batch_cuda_bls12_381(
+  BLS12_381::projective_t* arr,
+  uint32_t arr_size,
+  uint32_t batch_size,
+  bool inverse,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_batch_template<BLS12_381::projective_t,BLS12_381::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return ntt_end2end_batch_template<BLS12_381::projective_t, BLS12_381::scalar_t>(
+      arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
+extern "C" int interpolate_scalars_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_evaluations,
+  BLS12_381::scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_evaluations, BLS12_381::scalar_t* d_domain, unsigned n,
-                                              unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int interpolate_scalars_batch_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_evaluations,
+  BLS12_381::scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    cudaStreamCreate(&stream);
+    return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int interpolate_points_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::projective_t* d_evaluations,
+  BLS12_381::scalar_t* d_domain,
+  unsigned n,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_evaluations, BLS12_381::scalar_t* d_domain,
-                                             unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int interpolate_points_batch_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::projective_t* d_evaluations,
+  BLS12_381::scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    cudaStreamCreate(&stream);
+    return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_coefficients, BLS12_381::scalar_t *d_domain, 
-                                     unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_scalars_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    cudaStreamCreate(&stream);
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size,
-                                           unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_scalars_batch_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        auto result_code = evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, 0);
-        cudaStreamDestroy(stream);
-        return result_code;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    cudaStreamCreate(&stream);
+    auto result_code = evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, 0);
+    cudaStreamDestroy(stream);
+    return result_code;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_coefficients, BLS12_381::scalar_t *d_domain, 
-                                    unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_points_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::projective_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    cudaStreamCreate(&stream);
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size,
-                                          unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_points_batch_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::projective_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        BLS12_381::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        auto result_code = evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
-        cudaStreamDestroy(stream);
-        return result_code;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    BLS12_381::scalar_t* _null = nullptr;
+    cudaStreamCreate(&stream);
+    auto result_code =
+      evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
+    cudaStreamDestroy(stream);
+    return result_code;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_scalars_on_coset_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_coefficients, BLS12_381::scalar_t *d_domain, unsigned domain_size,
-                                              unsigned n, BLS12_381::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_scalars_on_coset_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_381::scalar_t* coset_powers,
+  unsigned device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size, 
-                                                    unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_381::scalar_t* coset_powers,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_points_on_coset_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_coefficients, BLS12_381::scalar_t *d_domain, unsigned domain_size,
-                                             unsigned n, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_points_on_coset_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::projective_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_381::scalar_t* coset_powers,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream); //TODO: don't create if default was passed, destroy what was created, same applies to all calls
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(
+      &stream); // TODO: don't create if default was passed, destroy what was created, same applies to all calls
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int evaluate_points_on_coset_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size, 
-                                                   unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int evaluate_points_on_coset_batch_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::projective_t* d_coefficients,
+  BLS12_381::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_381::scalar_t* coset_powers,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int ntt_inplace_batch_cuda_bls12_381(BLS12_381::scalar_t* d_inout, BLS12_381::scalar_t* d_twiddles,
-                                           unsigned n, unsigned batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int ntt_inplace_batch_cuda_bls12_381(
+  BLS12_381::scalar_t* d_inout,
+  BLS12_381::scalar_t* d_twiddles,
+  unsigned n,
+  unsigned batch_size,
+  bool inverse,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        BLS12_381::scalar_t* _null = nullptr;
-        ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true);
-        return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    BLS12_381::scalar_t* _null = nullptr;
+    ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true);
+    return CUDA_SUCCESS; // TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int
+reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
 {
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order(arr, n, logn, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2));
+    cudaStreamCreate(&stream);
+    reverse_order(arr, n, logn, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int reverse_order_scalars_batch_cuda_bls12_381(
+  BLS12_381::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
 {
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order_batch(arr, n, logn, batch_size, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2));
+    cudaStreamCreate(&stream);
+    reverse_order_batch(arr, n, logn, batch_size, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int
+reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
 {
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order(arr, n, logn, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2));
+    cudaStreamCreate(&stream);
+    reverse_order(arr, n, logn, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int sub_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_in1, BLS12_381::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0)
+extern "C" int sub_scalars_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_in1,
+  BLS12_381::scalar_t* d_in2,
+  unsigned n,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return sub_polys(d_out, d_in1, d_in2, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return sub_polys(d_out, d_in1, d_in2, n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int add_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_in1, BLS12_381::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0)
+extern "C" int add_scalars_cuda_bls12_381(
+  BLS12_381::scalar_t* d_out,
+  BLS12_381::scalar_t* d_in1,
+  BLS12_381::scalar_t* d_in2,
+  unsigned n,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return add_polys(d_out, d_in1, d_in2, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return add_polys(d_out, d_in1, d_in2, n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 extern "C" int to_montgomery_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery(d_inout, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return to_montgomery(d_inout, n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 extern "C" int from_montgomery_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery(d_inout, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return from_montgomery(d_inout, n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int to_montgomery_proj_points_cuda_bls12_381(BLS12_381::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+to_montgomery_proj_points_cuda_bls12_381(BLS12_381::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((BLS12_381::point_field_t*)d_inout, 3 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return to_montgomery((BLS12_381::point_field_t*)d_inout, 3 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int from_montgomery_proj_points_cuda_bls12_381(BLS12_381::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+from_montgomery_proj_points_cuda_bls12_381(BLS12_381::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((BLS12_381::point_field_t*)d_inout, 3 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return from_montgomery((BLS12_381::point_field_t*)d_inout, 3 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int to_montgomery_aff_points_cuda_bls12_381(BLS12_381::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+to_montgomery_aff_points_cuda_bls12_381(BLS12_381::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((BLS12_381::point_field_t*)d_inout, 2 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return to_montgomery((BLS12_381::point_field_t*)d_inout, 2 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int from_montgomery_aff_points_cuda_bls12_381(BLS12_381::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+from_montgomery_aff_points_cuda_bls12_381(BLS12_381::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((BLS12_381::point_field_t*)d_inout, 2 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return from_montgomery((BLS12_381::point_field_t*)d_inout, 2 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 #if defined(G2_DEFINED)
-extern "C" int to_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+to_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((BLS12_381::point_field_t*)d_inout, 6 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return to_montgomery((BLS12_381::point_field_t*)d_inout, 6 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int from_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+from_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((BLS12_381::point_field_t*)d_inout, 6 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return from_montgomery((BLS12_381::point_field_t*)d_inout, 6 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int to_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+to_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((BLS12_381::point_field_t*)d_inout, 4 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return to_montgomery((BLS12_381::point_field_t*)d_inout, 4 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int from_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+extern "C" int
+from_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((BLS12_381::point_field_t*)d_inout, 4 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    return from_montgomery((BLS12_381::point_field_t*)d_inout, 4 * n, stream);
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
 #endif

-extern "C" int reverse_order_points_batch_cuda_bls12_381(BLS12_381::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int reverse_order_points_batch_cuda_bls12_381(
+  BLS12_381::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
 {
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order_batch(arr, n, logn, batch_size, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2));
+    cudaStreamCreate(&stream);
+    reverse_order_batch(arr, n, logn, batch_size, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
 #endif
--- a/icicle/curves/bls12_381/msm.cu
+++ b/icicle/curves/bls12_381/msm.cu
@@ -1,41 +1,47 @@
 #ifndef _BLS12_381_MSM
 #define _BLS12_381_MSM
 #include "../../appUtils/msm/msm.cu"
-#include <stdexcept>
-#include <cuda.h>
 #include "curve_config.cuh"
+#include <cuda.h>
+#include <stdexcept>

-
-extern "C"
-int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[],
-              BLS12_381::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0) //TODO: unify parameter types size_t/unsigned etc
+extern "C" int msm_cuda_bls12_381(
+  BLS12_381::projective_t* out,
+  BLS12_381::affine_t points[],
+  BLS12_381::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0) // TODO: unify parameter types size_t/unsigned etc
 {
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381::affine_t points[],
-                              BLS12_381::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-  try
-  {
+  try {
    cudaStreamCreate(&stream);
-    batched_large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
+    large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
    cudaStreamSynchronize(stream);
    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
  }
-  catch (const std::runtime_error &ex)
-  {
+}
+
+extern "C" int msm_batch_cuda_bls12_381(
+  BLS12_381::projective_t* out,
+  BLS12_381::affine_t points[],
+  BLS12_381::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what());
    return -1;
  }
@@ -43,144 +49,168 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381:

 /**
 * Commit to a polynomial using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut point to write the result to.
 * @param d_scalars Scalars for the MSM. Must be on device.
 * @param d_points Points for the MSM. Must be on device.
 * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
 */
- extern "C"
- int commit_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
- {
-     try
-     {
-         cudaStreamCreate(&stream);
-         large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-         cudaStreamSynchronize(stream);
-         return CUDA_SUCCESS;
-     }
-     catch (const std::runtime_error &ex)
-     {
-         printf("error %s", ex.what());
-         return -1;
-     }
- }
- 
- /**
-  * Commit to a batch of polynomials using the MSM.
-  * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
-  * @param d_out Ouptut point to write the results to.
-  * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
-  * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
-  * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
-  * @param batch_size Size of the batch.
-  */
- extern "C"
- int commit_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
- {
-     try
-     {  
-        cudaStreamCreate(&stream);
-         batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-         cudaStreamSynchronize(stream);
-         return CUDA_SUCCESS;
-     }
-     catch (const std::runtime_error &ex)
-     {
-         printf("error %s", ex.what());
-         return -1;
-     }
- }
-
-#if defined(G2_DEFINED)
-extern "C"
-int msm_g2_cuda_bls12_381(BLS12_381::g2_projective_t *out, BLS12_381::g2_affine_t points[],
-              BLS12_381::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::scalar_t* d_scalars,
+  BLS12_381::affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<BLS12_381::scalar_t, BLS12_381::g2_projective_t, BLS12_381::g2_affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int msm_batch_g2_cuda_bls12_381(BLS12_381::g2_projective_t* out, BLS12_381::g2_affine_t points[],
-                              BLS12_381::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
+/**
+ * Commit to a batch of polynomials using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Output point to write the results to.
+ * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
+ * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
+ * @param count Length of `d_points` array, `d_scalars` has length `count` * `batch_size`.
+ * @param batch_size Size of the batch.
+ */
+extern "C" int commit_batch_cuda_bls12_381(
+  BLS12_381::projective_t* d_out,
+  BLS12_381::scalar_t* d_scalars,
+  BLS12_381::affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<BLS12_381::scalar_t, BLS12_381::g2_projective_t, BLS12_381::g2_affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+#if defined(G2_DEFINED)
+extern "C" int msm_g2_cuda_bls12_381(
+  BLS12_381::g2_projective_t* out,
+  BLS12_381::g2_affine_t points[],
+  BLS12_381::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<BLS12_381::scalar_t, BLS12_381::g2_projective_t, BLS12_381::g2_affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+extern "C" int msm_batch_g2_cuda_bls12_381(
+  BLS12_381::g2_projective_t* out,
+  BLS12_381::g2_affine_t points[],
+  BLS12_381::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<BLS12_381::scalar_t, BLS12_381::g2_projective_t, BLS12_381::g2_affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 /**
 * Commit to a polynomial using the MSM in G2 group.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut G2 point to write the result to.
 * @param d_scalars Scalars for the MSM. Must be on device.
 * @param d_points G2 affine points for the MSM. Must be on device.
 * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
 */
-extern "C"
-int commit_g2_cuda_bls12_381(BLS12_381::g2_projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_g2_cuda_bls12_381(
+  BLS12_381::g2_projective_t* d_out,
+  BLS12_381::scalar_t* d_scalars,
+  BLS12_381::g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
- 
- /**
-  * Commit to a batch of polynomials using the MSM.
-  * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
-  * @param d_out Ouptut G2 point to write the results to.
-  * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
-  * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
-  * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
-  * @param batch_size Size of the batch.
-  */
-extern "C"
-int commit_batch_g2_cuda_bls12_381(BLS12_381::g2_projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+
+/**
+ * Commit to a batch of polynomials using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Output G2 point to write the results to.
+ * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
+ * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for
+ * each MSM.
+ * @param count Length of `d_points` array, `d_scalars` has length `count` * `batch_size`.
+ * @param batch_size Size of the batch.
+ */
+extern "C" int commit_batch_g2_cuda_bls12_381(
+  BLS12_381::g2_projective_t* d_out,
+  BLS12_381::scalar_t* d_scalars,
+  BLS12_381::g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
 #endif
 #endif
--- a/icicle/curves/bls12_381/params.cuh
+++ b/icicle/curves/bls12_381/params.cuh
@@ -1,219 +1,411 @@
 #pragma once
 #include "../../utils/storage.cuh"

-namespace PARAMS_BLS12_381{
+namespace PARAMS_BLS12_381 {
  struct fp_config {
    // field structure size = 8 * 32 bit
    static constexpr unsigned limbs_count = 8;
    static constexpr unsigned omegas_count = 32;
    // modulus = 52435875175126190479447740508185965837690552500527637822603658699938581184513
-    static constexpr storage<limbs_count> modulus = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
+    static constexpr storage<limbs_count> modulus = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402,
+                                                     0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
    // modulus*2 = 104871750350252380958895481016371931675381105001055275645207317399877162369026
-    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0xfffffffe, 0xfffcb7fd, 0xa77b4805, 0x1343b00a, 0x6673b010, 0x533afa90, 0xe7db4ea6};
-    static constexpr storage<limbs_count> modulus_4 = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  
-    static constexpr storage<2 * limbs_count> modulus_wide = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753,
-                                                              0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0xfffffffe, 0xfffcb7fd, 0xa77b4805,
+                                                       0x1343b00a, 0x6673b010, 0x533afa90, 0xe7db4ea6};
+    static constexpr storage<limbs_count> modulus_4 = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                       0x00000000, 0x00000000, 0x00000000, 0x00000000};
+
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
    // modulus^2
-    static constexpr storage<2*limbs_count> modulus_squared = {0x00000001, 0xfffffffe, 0xfffcb7fe, 0xa77e9007, 0x1cdbb005, 0x698ae002, 0x5433f7b8, 0x48aa415e, 
-                                                              0x4aa9c661, 0xc2611f6f, 0x59934a1d, 0x0e9593f9, 0xef2cc20f, 0x520c13db, 0xf4bc2778, 0x347f60f3};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x00000001, 0xfffffffe, 0xfffcb7fe, 0xa77e9007, 0x1cdbb005, 0x698ae002, 0x5433f7b8, 0x48aa415e,
+      0x4aa9c661, 0xc2611f6f, 0x59934a1d, 0x0e9593f9, 0xef2cc20f, 0x520c13db, 0xf4bc2778, 0x347f60f3};
    // 2*modulus^2
-    static constexpr storage<2*limbs_count> modulus_squared_2 = {0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, 
-                                                                0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc,
+      0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
    // note: doesnt actually fit into 384 bits, and shouldnt be used! is added for compilation
-    static constexpr storage<2*limbs_count> modulus_squared_4 = {0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, 
-                                                                0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc,
+      0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
    static constexpr unsigned modulus_bit_count = 255;
    // m = floor(2^(2*modulus_bit_count) / modulus)
-    static constexpr storage<limbs_count> m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad, 0xc1f823b4, 0xe2d772d, 0x7fb78ddf, 0x8d54253b};
-  
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xfffffffe, 0x00000001, 0x00034802, 0x5884b7fa, 0xecbc4ff5, 0x998c4fef, 0xacc5056f, 0x1824b159};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0xfe75c040, 0x13f75b69, 0x09dc705f, 0xab6fca8f, 0x4f77266a, 0x7204078a, 0x30009d57, 0x1bbe8693};
-  
-    // static constexpr storage<limbs_count> omega[32]= { {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000}, {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d}, {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e}, {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb}, {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac}, {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802}, {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59}, {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667}, {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098}, {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b}, {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0}, {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8}, {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8}, {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911}, {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd}, {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333}, {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db}, {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83}, {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6, 
0x5f686d91, 0x3436287f}, {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5}, {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3}, {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd}, {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc}, {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd}, {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580}, {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d}, {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d}, {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f}, {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b}, {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72}, {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}};
-    // Quick fix for linking issue
-    static constexpr storage<limbs_count> omega1=   {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
-    static constexpr storage<limbs_count> omega2=   {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> omega3=   {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d};
-    static constexpr storage<limbs_count> omega4=   {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e};
-    static constexpr storage<limbs_count> omega5=   {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb};
-    static constexpr storage<limbs_count> omega6=   {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac};
-    static constexpr storage<limbs_count> omega7=   {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802};
-    static constexpr storage<limbs_count> omega8=   {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59};
-    static constexpr storage<limbs_count> omega9=   {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667};
-    static constexpr storage<limbs_count> omega10=  {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098};
-    static constexpr storage<limbs_count> omega11=  {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b};
-    static constexpr storage<limbs_count> omega12=  {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0};
-    static constexpr storage<limbs_count> omega13=  {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8};
-    static constexpr storage<limbs_count> omega14=  {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8};
-    static constexpr storage<limbs_count> omega15=  {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911};
-    static constexpr storage<limbs_count> omega16=  {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd};
-    static constexpr storage<limbs_count> omega17=  {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333};
-    static constexpr storage<limbs_count> omega18=  {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db};
-    static constexpr storage<limbs_count> omega19=  {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83};
-    static constexpr storage<limbs_count> omega20=  {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f};
-    static constexpr storage<limbs_count> omega21=  {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5};
-    static constexpr storage<limbs_count> omega22=  {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3};
-    static constexpr storage<limbs_count> omega23=  {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd};
-    static constexpr storage<limbs_count> omega24=  {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc};
-    static constexpr storage<limbs_count> omega25=  {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd};
-    static constexpr storage<limbs_count> omega26=  {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580};
-    static constexpr storage<limbs_count> omega27=  {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d};
-    static constexpr storage<limbs_count> omega28=  {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d};
-    static constexpr storage<limbs_count> omega29=  {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f};
-    static constexpr storage<limbs_count> omega30=  {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b};
-    static constexpr storage<limbs_count> omega31=  {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72};
-    static constexpr storage<limbs_count> omega32=  {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e};
+    static constexpr storage<limbs_count> m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad,
+                                               0xc1f823b4, 0xe2d772d,  0x7fb78ddf, 0x8d54253b};
+
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xfffffffe, 0x00000001, 0x00034802, 0x5884b7fa,
+                                                          0xecbc4ff5, 0x998c4fef, 0xacc5056f, 0x1824b159};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0xfe75c040, 0x13f75b69, 0x09dc705f, 0xab6fca8f,
+                                                              0x4f77266a, 0x7204078a, 0x30009d57, 0x1bbe8693};
+
+    // static constexpr storage<limbs_count> omega[32]= { {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805,
+    // 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce,
+    // 0x00000000, 0x00000000}, {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660,
+    // 0x3f96405d}, {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e},
+    // {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb}, {0xac5db47f,
+    // 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac}, {0xab28e208, 0xb750da4c,
+    // 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802}, {0x2fe322b8, 0x2cabadec, 0x15412560,
+    // 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59}, {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c,
+    // 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667}, {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d,
+    // 0x3ed6d55a, 0x58f43cef, 0x2f27b098}, {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a,
+    // 0xca252472, 0x43527a8b}, {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b,
+    // 0x110cebd0}, {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8},
+    // {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8}, {0xa97eccd4,
+    // 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911}, {0xcfc35f7a, 0x137b458a,
+    // 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd}, {0x8831e03e, 0x10251f7d, 0x7ff858ec,
+    // 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333}, {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d,
+    // 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db}, {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253,
+    // 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83}, {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6,
+    // 0x5f686d91, 0x3436287f}, {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0,
+    // 0x6eee34d5}, {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3},
+    // {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd}, {0x1ab70e2c,
+    // 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc}, {0x59a2e8eb, 0x801c894c,
+    // 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd}, {0xcca1d8be, 0x810fa372, 0x82e0bfa7,
+    // 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580}, {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f,
+    // 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d}, {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a,
+    // 0x97ae418d, 0x5e3e7682, 0x2967385d}, {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73,
+    // 0x4a939684, 0x705aba4f}, {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29,
+    // 0x086d072b}, {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72},
+    // {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}};
+    // Quick fix for linking issue
+    static constexpr storage<limbs_count> omega1 = {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402,
+                                                    0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
+    static constexpr storage<limbs_count> omega2 = {0x00000000, 0x00010000, 0x76030000, 0xec030002,
+                                                    0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> omega3 = {0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240,
+                                                    0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d};
+    static constexpr storage<limbs_count> omega4 = {0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672,
+                                                    0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e};
+    static constexpr storage<limbs_count> omega5 = {0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c,
+                                                    0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb};
+    static constexpr storage<limbs_count> omega6 = {0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6,
+                                                    0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac};
+    static constexpr storage<limbs_count> omega7 = {0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64,
+                                                    0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802};
+    static constexpr storage<limbs_count> omega8 = {0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3,
+                                                    0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59};
+    static constexpr storage<limbs_count> omega9 = {0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c,
+                                                    0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667};
+    static constexpr storage<limbs_count> omega10 = {0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0,
+                                                     0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098};
+    static constexpr storage<limbs_count> omega11 = {0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14,
+                                                     0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b};
+    static constexpr storage<limbs_count> omega12 = {0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171,
+                                                     0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0};
+    static constexpr storage<limbs_count> omega13 = {0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce,
+                                                     0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8};
+    static constexpr storage<limbs_count> omega14 = {0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727,
+                                                     0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8};
+    static constexpr storage<limbs_count> omega15 = {0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e,
+                                                     0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911};
+    static constexpr storage<limbs_count> omega16 = {0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a,
+                                                     0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd};
+    static constexpr storage<limbs_count> omega17 = {0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93,
+                                                     0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333};
+    static constexpr storage<limbs_count> omega18 = {0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d,
+                                                     0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db};
+    static constexpr storage<limbs_count> omega19 = {0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673,
+                                                     0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83};
+    static constexpr storage<limbs_count> omega20 = {0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa,
+                                                     0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f};
+    static constexpr storage<limbs_count> omega21 = {0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f,
+                                                     0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5};
+    static constexpr storage<limbs_count> omega22 = {0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42,
+                                                     0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3};
+    static constexpr storage<limbs_count> omega23 = {0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e,
+                                                     0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd};
+    static constexpr storage<limbs_count> omega24 = {0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31,
+                                                     0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc};
+    static constexpr storage<limbs_count> omega25 = {0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c,
+                                                     0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd};
+    static constexpr storage<limbs_count> omega26 = {0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28,
+                                                     0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580};
+    static constexpr storage<limbs_count> omega27 = {0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f,
+                                                     0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d};
+    static constexpr storage<limbs_count> omega28 = {0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a,
+                                                     0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d};
+    static constexpr storage<limbs_count> omega29 = {0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157,
+                                                     0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f};
+    static constexpr storage<limbs_count> omega30 = {0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e,
+                                                     0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b};
+    static constexpr storage<limbs_count> omega31 = {0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9,
+                                                     0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72};
+    static constexpr storage<limbs_count> omega32 = {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2,
+                                                     0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e};

    static constexpr storage_array<omegas_count, limbs_count> omega = {
-        omega1, omega2, omega3, omega4, omega5, omega6, omega7, omega8, 
-        omega9, omega10, omega11, omega12, omega13, omega14, omega15, omega16,
-        omega17, omega18, omega19, omega20, omega21, omega22, omega23, omega24,
-        omega25, omega26, omega27, omega28, omega29, omega30, omega31, omega32,
+      omega1,  omega2,  omega3,  omega4,  omega5,  omega6,  omega7,  omega8,  omega9,  omega10, omega11,
+      omega12, omega13, omega14, omega15, omega16, omega17, omega18, omega19, omega20, omega21, omega22,
+      omega23, omega24, omega25, omega26, omega27, omega28, omega29, omega30, omega31, omega32,
    };
-  
-    // static constexpr storage<limbs_count> omega_inv[32]={ {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753}, {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e}, {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4}, {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c}, {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee}, {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d}, {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25}, {0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e}, {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508}, {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d}, {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63}, {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7}, {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1}, {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac}, {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003}, {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c}, {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7}, {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950}, {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3, 0x167fce38, 
0x6f5d6dfa, 0x545ad9b2}, {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6}, {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5}, {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9}, {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960}, {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6}, {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf}, {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f}, {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533}, {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff}, {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287}, {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6}, {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}};
+
+    // static constexpr storage<limbs_count> omega_inv[32]={ {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402,
+    // 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753}, {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334,
+    // 0xa5e80b39, 0x299d7d47, 0x73eda753}, {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff,
+    // 0x5ce11044, 0x1333b22e}, {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b,
+    // 0x551115b4}, {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c},
+    // {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee}, {0xcf28601b,
+    // 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d}, {0x6a2f777a, 0xe9561c17,
+    // 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25}, {0xf02a116e, 0xfb350dbe, 0xb4543a3e,
+    // 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e}, {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41,
+    // 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508}, {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1,
+    // 0xdeae67bc, 0x65ba213e, 0x394fda0d}, {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340,
+    // 0x6d174692, 0x58c3ba63}, {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f,
+    // 0x044107b7}, {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1},
+    // {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac}, {0x9ed57ae5,
+    // 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003}, {0x645e1cfa, 0x903a0a0c,
+    // 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c}, {0x14b1ba04, 0xb49d6b05, 0xf00b84f2,
+    // 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7}, {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b,
+    // 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950}, {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3,
+    // 0x167fce38, 0x6f5d6dfa, 0x545ad9b2}, {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b,
+    // 0x6fa2672c, 0x156cd7f6}, {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503,
+    // 0x47880cd5}, {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9},
+    // {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960}, {0x20238f62,
+    // 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6}, {0xe8bff41e, 0x65b09c73,
+    // 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf}, {0xd5fdb757, 0x8480c0e7, 0x365bf9fd,
+    // 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f}, {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d,
+    // 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533}, {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9,
+    // 0x2d44da3b, 0xfd09be59, 0x092778ff}, {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724,
+    // 0xf386c0d2, 0x24e5d287}, {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f,
+    // 0x0158abd6}, {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}};
    // Quick fix for linking issue
-    static constexpr storage<limbs_count> omega_inv1=   {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
-    static constexpr storage<limbs_count> omega_inv2=   {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753};
-    static constexpr storage<limbs_count> omega_inv3=   {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e};
-    static constexpr storage<limbs_count> omega_inv4=   {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4};
-    static constexpr storage<limbs_count> omega_inv5=   {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c};
-    static constexpr storage<limbs_count> omega_inv6=   {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee};
-    static constexpr storage<limbs_count> omega_inv7=   {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d};
-    static constexpr storage<limbs_count> omega_inv8=   {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25};
-    static constexpr storage<limbs_count> omega_inv9=   {0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e};
-    static constexpr storage<limbs_count> omega_inv10=  {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508};
-    static constexpr storage<limbs_count> omega_inv11=  {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d};
-    static constexpr storage<limbs_count> omega_inv12=  {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63};
-    static constexpr storage<limbs_count> omega_inv13=  {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7};
-    static constexpr storage<limbs_count> omega_inv14=  {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1};
-    static constexpr storage<limbs_count> omega_inv15=  {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac};
-    static constexpr storage<limbs_count> omega_inv16=  {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003};
-    static constexpr storage<limbs_count> omega_inv17=  {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c};
-    static constexpr storage<limbs_count> omega_inv18=  {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7};
-    static constexpr storage<limbs_count> omega_inv19=  {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950};
-    static constexpr storage<limbs_count> omega_inv20=  {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2};
-    static constexpr storage<limbs_count> omega_inv21=  {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6};
-    static constexpr storage<limbs_count> omega_inv22=  {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5};
-    static constexpr storage<limbs_count> omega_inv23=  {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9};
-    static constexpr storage<limbs_count> omega_inv24=  {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960};
-    static constexpr storage<limbs_count> omega_inv25=  {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6};
-    static constexpr storage<limbs_count> omega_inv26=  {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf};
-    static constexpr storage<limbs_count> omega_inv27=  {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f};
-    static constexpr storage<limbs_count> omega_inv28=  {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533};
-    static constexpr storage<limbs_count> omega_inv29=  {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff};
-    static constexpr storage<limbs_count> omega_inv30=  {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287};
-    static constexpr storage<limbs_count> omega_inv31=  {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6};
-    static constexpr storage<limbs_count> omega_inv32=  {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666};
-    
+    static constexpr storage<limbs_count> omega_inv1 = {0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402,
+                                                        0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
+    static constexpr storage<limbs_count> omega_inv2 = {0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400,
+                                                        0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753};
+    static constexpr storage<limbs_count> omega_inv3 = {0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036,
+                                                        0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e};
+    static constexpr storage<limbs_count> omega_inv4 = {0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896,
+                                                        0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4};
+    static constexpr storage<limbs_count> omega_inv5 = {0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f,
+                                                        0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c};
+    static constexpr storage<limbs_count> omega_inv6 = {0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501,
+                                                        0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee};
+    static constexpr storage<limbs_count> omega_inv7 = {0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582,
+                                                        0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d};
+    static constexpr storage<limbs_count> omega_inv8 = {0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03,
+                                                        0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25};
+    static constexpr storage<limbs_count> omega_inv9 = {0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf,
+                                                        0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e};
+    static constexpr storage<limbs_count> omega_inv10 = {0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41,
+                                                         0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508};
+    static constexpr storage<limbs_count> omega_inv11 = {0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32,
+                                                         0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d};
+    static constexpr storage<limbs_count> omega_inv12 = {0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e,
+                                                         0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63};
+    static constexpr storage<limbs_count> omega_inv13 = {0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6,
+                                                         0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7};
+    static constexpr storage<limbs_count> omega_inv14 = {0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4,
+                                                         0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1};
+    static constexpr storage<limbs_count> omega_inv15 = {0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d,
+                                                         0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac};
+    static constexpr storage<limbs_count> omega_inv16 = {0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a,
+                                                         0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003};
+    static constexpr storage<limbs_count> omega_inv17 = {0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb,
+                                                         0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c};
+    static constexpr storage<limbs_count> omega_inv18 = {0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4,
+                                                         0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7};
+    static constexpr storage<limbs_count> omega_inv19 = {0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b,
+                                                         0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950};
+    static constexpr storage<limbs_count> omega_inv20 = {0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a,
+                                                         0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2};
+    static constexpr storage<limbs_count> omega_inv21 = {0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e,
+                                                         0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6};
+    static constexpr storage<limbs_count> omega_inv22 = {0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab,
+                                                         0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5};
+    static constexpr storage<limbs_count> omega_inv23 = {0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673,
+                                                         0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9};
+    static constexpr storage<limbs_count> omega_inv24 = {0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a,
+                                                         0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960};
+    static constexpr storage<limbs_count> omega_inv25 = {0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097,
+                                                         0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6};
+    static constexpr storage<limbs_count> omega_inv26 = {0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8,
+                                                         0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf};
+    static constexpr storage<limbs_count> omega_inv27 = {0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0,
+                                                         0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f};
+    static constexpr storage<limbs_count> omega_inv28 = {0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d,
+                                                         0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533};
+    static constexpr storage<limbs_count> omega_inv29 = {0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0,
+                                                         0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff};
+    static constexpr storage<limbs_count> omega_inv30 = {0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9,
+                                                         0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287};
+    static constexpr storage<limbs_count> omega_inv31 = {0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5,
+                                                         0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6};
+    static constexpr storage<limbs_count> omega_inv32 = {0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d,
+                                                         0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666};
+
    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
-        omega_inv1, omega_inv2, omega_inv3, omega_inv4, omega_inv5, omega_inv6, omega_inv7, omega_inv8, 
-        omega_inv9, omega_inv10, omega_inv11, omega_inv12, omega_inv13, omega_inv14, omega_inv15, omega_inv16,
-        omega_inv17, omega_inv18, omega_inv19, omega_inv20, omega_inv21, omega_inv22, omega_inv23, omega_inv24,
-        omega_inv25, omega_inv26, omega_inv27, omega_inv28, omega_inv29, omega_inv30, omega_inv31, omega_inv32,
+      omega_inv1,  omega_inv2,  omega_inv3,  omega_inv4,  omega_inv5,  omega_inv6,  omega_inv7,  omega_inv8,
+      omega_inv9,  omega_inv10, omega_inv11, omega_inv12, omega_inv13, omega_inv14, omega_inv15, omega_inv16,
+      omega_inv17, omega_inv18, omega_inv19, omega_inv20, omega_inv21, omega_inv22, omega_inv23, omega_inv24,
+      omega_inv25, omega_inv26, omega_inv27, omega_inv28, omega_inv29, omega_inv30, omega_inv31, omega_inv32,
    };
-    
+
    // Quick fix for linking issue
-    static constexpr storage<limbs_count> inv1=   {0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201, 0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9};
-    static constexpr storage<limbs_count> inv2=   {0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02, 0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e};
-    static constexpr storage<limbs_count> inv3=   {0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82, 0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268};
-    static constexpr storage<limbs_count> inv4=   {0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2, 0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd};
-    static constexpr storage<limbs_count> inv5=   {0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2, 0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18};
-    static constexpr storage<limbs_count> inv6=   {0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72, 0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5};
-    static constexpr storage<limbs_count> inv7=   {0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba, 0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04};
-    static constexpr storage<limbs_count> inv8=   {0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e, 0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab};
-    static constexpr storage<limbs_count> inv9=   {0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530, 0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f};
-    static constexpr storage<limbs_count> inv10=  {0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499, 0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9};
-    static constexpr storage<limbs_count> inv11=  {0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e, 0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e};
-    static constexpr storage<limbs_count> inv12=  {0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828, 0x89213de7, 0x5eb6a46a, 0xb46ae370, 0x73e66878};
-    static constexpr storage<limbs_count> inv13=  {0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615, 0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5};
-    static constexpr storage<limbs_count> inv14=  {0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c, 0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c};
-    static constexpr storage<limbs_count> inv15=  {0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87, 0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77};
-    static constexpr storage<limbs_count> inv16=  {0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045, 0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365};
-    static constexpr storage<limbs_count> inv17=  {0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24, 0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c};
-    static constexpr storage<limbs_count> inv18=  {0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13, 0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57};
-    static constexpr storage<limbs_count> inv19=  {0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b, 0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5};
-    static constexpr storage<limbs_count> inv20=  {0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7, 0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014};
-    static constexpr storage<limbs_count> inv21=  {0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965, 0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3};
-    static constexpr storage<limbs_count> inv22=  {0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4, 0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583};
-    static constexpr storage<limbs_count> inv23=  {0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b, 0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b};
-    static constexpr storage<limbs_count> inv24=  {0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf, 0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df};
-    static constexpr storage<limbs_count> inv25=  {0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159, 0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719};
-    static constexpr storage<limbs_count> inv26=  {0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae, 0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736};
-    static constexpr storage<limbs_count> inv27=  {0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358, 0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744};
-    static constexpr storage<limbs_count> inv28=  {0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad, 0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b};
-    static constexpr storage<limbs_count> inv29=  {0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8, 0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f};
-    static constexpr storage<limbs_count> inv30=  {0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed, 0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751};
-    static constexpr storage<limbs_count> inv31=  {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752};
-    static constexpr storage<limbs_count> inv32=  {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, 0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752};
+    static constexpr storage<limbs_count> inv1 = {0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201,
+                                                  0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9};
+    static constexpr storage<limbs_count> inv2 = {0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02,
+                                                  0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e};
+    static constexpr storage<limbs_count> inv3 = {0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82,
+                                                  0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268};
+    static constexpr storage<limbs_count> inv4 = {0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2,
+                                                  0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd};
+    static constexpr storage<limbs_count> inv5 = {0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2,
+                                                  0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18};
+    static constexpr storage<limbs_count> inv6 = {0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72,
+                                                  0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5};
+    static constexpr storage<limbs_count> inv7 = {0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba,
+                                                  0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04};
+    static constexpr storage<limbs_count> inv8 = {0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e,
+                                                  0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab};
+    static constexpr storage<limbs_count> inv9 = {0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530,
+                                                  0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f};
+    static constexpr storage<limbs_count> inv10 = {0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499,
+                                                   0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9};
+    static constexpr storage<limbs_count> inv11 = {0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e,
+                                                   0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e};
+    static constexpr storage<limbs_count> inv12 = {0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828,
+                                                   0x89213de7, 0x5eb6a46a, 0xb46ae370, 0x73e66878};
+    static constexpr storage<limbs_count> inv13 = {0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615,
+                                                   0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5};
+    static constexpr storage<limbs_count> inv14 = {0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c,
+                                                   0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c};
+    static constexpr storage<limbs_count> inv15 = {0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87,
+                                                   0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77};
+    static constexpr storage<limbs_count> inv16 = {0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045,
+                                                   0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365};
+    static constexpr storage<limbs_count> inv17 = {0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24,
+                                                   0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c};
+    static constexpr storage<limbs_count> inv18 = {0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13,
+                                                   0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57};
+    static constexpr storage<limbs_count> inv19 = {0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b,
+                                                   0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5};
+    static constexpr storage<limbs_count> inv20 = {0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7,
+                                                   0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014};
+    static constexpr storage<limbs_count> inv21 = {0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965,
+                                                   0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3};
+    static constexpr storage<limbs_count> inv22 = {0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4,
+                                                   0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583};
+    static constexpr storage<limbs_count> inv23 = {0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b,
+                                                   0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b};
+    static constexpr storage<limbs_count> inv24 = {0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf,
+                                                   0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df};
+    static constexpr storage<limbs_count> inv25 = {0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159,
+                                                   0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719};
+    static constexpr storage<limbs_count> inv26 = {0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae,
+                                                   0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736};
+    static constexpr storage<limbs_count> inv27 = {0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358,
+                                                   0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744};
+    static constexpr storage<limbs_count> inv28 = {0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad,
+                                                   0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b};
+    static constexpr storage<limbs_count> inv29 = {0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8,
+                                                   0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f};
+    static constexpr storage<limbs_count> inv30 = {0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed,
+                                                   0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751};
+    static constexpr storage<limbs_count> inv31 = {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8,
+                                                   0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752};
+    static constexpr storage<limbs_count> inv32 = {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd,
+                                                   0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752};

    static constexpr storage_array<omegas_count, limbs_count> inv = {
-        inv1, inv2, inv3, inv4, inv5, inv6, inv7, inv8, 
-        inv9, inv10, inv11, inv12, inv13, inv14, inv15, inv16,
-        inv17, inv18, inv19, inv20, inv21, inv22, inv23, inv24,
-        inv25, inv26, inv27, inv28, inv29, inv30, inv31, inv32,
-    }; 
-};
-  
+      inv1,  inv2,  inv3,  inv4,  inv5,  inv6,  inv7,  inv8,  inv9,  inv10, inv11, inv12, inv13, inv14, inv15, inv16,
+      inv17, inv18, inv19, inv20, inv21, inv22, inv23, inv24, inv25, inv26, inv27, inv28, inv29, inv30, inv31, inv32,
+    };
+  };
+
  struct fq_config {
    // field structure size = 12 * 32 bit
    static constexpr unsigned limbs_count = 12;
-    // modulus = 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
-    static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84, 0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
-    // modulus*2 = 8004819110443334786835579651471808313113765639878015770664116272248063300981675728885375258258031328075788545119574
-    static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd, 0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709, 0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
-    // modulus*4 = 16009638220886669573671159302943616626227531279756031541328232544496126601963351457770750516516062656151577090239148
-    static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa, 0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13, 0xd2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
-    
-    static constexpr storage<2*limbs_count> modulus_wide = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84, 
-                                                            0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                            0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  
+    // modulus =
+    // 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
+    static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
+                                                     0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
+                                                     0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
+    // modulus*2 =
+    // 8004819110443334786835579651471808313113765639878015770664116272248063300981675728885375258258031328075788545119574
+    static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
+                                                       0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
+                                                       0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
+    // modulus*4 =
+    // 16009638220886669573671159302943616626227531279756031541328232544496126601963351457770750516516062656151577090239148
+    static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
+                                                       0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
+                                                       0xd2eb35d,  0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
+
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
+      0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+
    // modulus^2
-    static constexpr storage<2*limbs_count> modulus_squared = {0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed, 
-                                                              0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd, 
-                                                              0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
+      0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
+      0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
    // 2*modulus^2
-    static constexpr storage<2*limbs_count> modulus_squared_2 = {0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da, 
-                                                                0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa, 
-                                                                0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
+      0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
+      0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
    // 4*modulus^2
-    static constexpr storage<2*limbs_count> modulus_squared_4 = {0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4, 
-                                                                0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4, 
-                                                                0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
+      0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
+      0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
    static constexpr unsigned modulus_bit_count = 381;
    // m = floor(2^(2*modulus_bit_count) / modulus)
-    static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7, 0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0x0005555, 0x60100000, 0xeac00004, 0x15400014, 0x94f09dbe, 0x8cf2d5f0, 0xc7aed409, 0xb88b47b0, 0xcb453289, 0x4e45849b, 0x6801965b, 0x5feee15c};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x05c40fe, 0xaa212c9c, 0xccfd7e14, 0x70093ae9, 0xc85a96b4, 0x6d05c02d, 0x025fecd3, 0x1f193851, 0xeb48f4c6, 0x84d32f44, 0xed8ffb1a, 0xbefcc91e};
+    static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
+                                               0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0x0005555,  0x60100000, 0xeac00004, 0x15400014,
+                                                          0x94f09dbe, 0x8cf2d5f0, 0xc7aed409, 0xb88b47b0,
+                                                          0xcb453289, 0x4e45849b, 0x6801965b, 0x5feee15c};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x05c40fe,  0xaa212c9c, 0xccfd7e14, 0x70093ae9,
+                                                              0xc85a96b4, 0x6d05c02d, 0x025fecd3, 0x1f193851,
+                                                              0xeb48f4c6, 0x84d32f44, 0xed8ffb1a, 0xbefcc91e};
    // i^2, the square of the imaginary unit for the extension field
    static constexpr uint32_t i_squared = 1;
    // true if i^2 is negative
    static constexpr bool i_squared_is_negative = true;
-    // G1 and G2 generators 
-    static constexpr storage<limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f, 0x171bac58, 0xa14e3a3f,
-                                                         0x9774b905, 0xc3688c4f, 0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
-    static constexpr storage<limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744, 0x2c04b3ed, 0x00db18cb,
-                                                         0xd5d00af6, 0xfcf5e095, 0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
-    static constexpr storage<limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326, 0x7ae3d177, 0xb4510b64,
-                                                            0xfa403b02, 0xc6e47ad4, 0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
-    static constexpr storage<limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112, 0xdc7f5049, 0xb5da61bb,
-                                                            0x9920b61a, 0x596bd0d0, 0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
-    static constexpr storage<limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc, 0x5160d12c, 0x6d429a69,
-                                                            0x8cbdd3a7, 0xadfd9baa, 0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
-    static constexpr storage<limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27, 0x572e99ab, 0x267492ab,
-                                                            0x85a763af, 0xcb3e287e, 0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
+    // G1 and G2 generators
+    static constexpr storage<limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
+                                                      0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
+                                                      0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
+    static constexpr storage<limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
+                                                      0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
+                                                      0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
+    static constexpr storage<limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
+                                                         0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
+                                                         0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
+    static constexpr storage<limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
+                                                         0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
+                                                         0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
+    static constexpr storage<limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
+                                                         0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
+                                                         0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
+    static constexpr storage<limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
+                                                         0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
+                                                         0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
  };
-  
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                          0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                          0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-}
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+} // namespace PARAMS_BLS12_381
--- a/icicle/curves/bls12_381/poseidon.cu
+++ b/icicle/curves/bls12_381/poseidon.cu
@@ -1,23 +1,25 @@
 #ifndef _BLS12_381_POSEIDON
 #define _BLS12_381_POSEIDON
-#include <cuda.h>
-#include <stdexcept>
 #include "../../appUtils/poseidon/poseidon.cu"
 #include "curve_config.cuh"
+#include <cuda.h>
+#include <stdexcept>

 template class Poseidon<BLS12_381::scalar_t>;

-extern "C" int poseidon_multi_cuda_bls12_381(BLS12_381::scalar_t input[], BLS12_381::scalar_t* out,
-                                             size_t number_of_blocks, int arity, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int poseidon_multi_cuda_bls12_381(
+  BLS12_381::scalar_t input[],
+  BLS12_381::scalar_t* out,
+  size_t number_of_blocks,
+  int arity,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-  try
-  {
-    // TODO: once we get bindings to pass a stream, we should make {stream} a required parameter and use it instead of 
+  try {
+    // TODO: once we get bindings to pass a stream, we should make {stream} a required parameter and use it instead of
    // creating a new stream
-    if (stream == 0) {
-        cudaStreamCreate(&stream);
-    }
-    
+    if (stream == 0) { cudaStreamCreate(&stream); }
+
    cudaEvent_t start_event, end_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&end_event);
@@ -27,19 +29,17 @@ extern "C" int poseidon_multi_cuda_bls12_381(BLS12_381::scalar_t input[], BLS12_
    cudaEventRecord(end_event, stream);
    cudaEventSynchronize(end_event);

-    #ifdef DEBUG
+#ifdef DEBUG
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start_event, end_event);
    printf("Time elapsed: %f", elapsedTime);
-    #endif
+#endif

    cudaEventDestroy(start_event);
    cudaEventDestroy(end_event);

    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what());
    return -1;
  }
--- a/icicle/curves/bls12_381/projective.cu
+++ b/icicle/curves/bls12_381/projective.cu
@@ -1,19 +1,23 @@
-#include <cuda.h>
-#include "curve_config.cuh"
 #include "../../primitives/projective.cuh"
+#include "curve_config.cuh"
+#include <cuda.h>

-extern "C" bool eq_bls12_381(BLS12_381::projective_t *point1, BLS12_381::projective_t *point2)
+extern "C" bool eq_bls12_381(BLS12_381::projective_t* point1, BLS12_381::projective_t* point2)
 {
-    return (*point1 == *point2) && 
-    !((point1->x == BLS12_381::point_field_t::zero()) && (point1->y == BLS12_381::point_field_t::zero()) && (point1->z == BLS12_381::point_field_t::zero())) && 
-    !((point2->x == BLS12_381::point_field_t::zero()) && (point2->y == BLS12_381::point_field_t::zero()) && (point2->z == BLS12_381::point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_381::point_field_t::zero()) && (point1->y == BLS12_381::point_field_t::zero()) &&
+           (point1->z == BLS12_381::point_field_t::zero())) &&
+         !((point2->x == BLS12_381::point_field_t::zero()) && (point2->y == BLS12_381::point_field_t::zero()) &&
+           (point2->z == BLS12_381::point_field_t::zero()));
 }

 #if defined(G2_DEFINED)
-extern "C" bool eq_g2_bls12_381(BLS12_381::g2_projective_t *point1, BLS12_381::g2_projective_t *point2)
+extern "C" bool eq_g2_bls12_381(BLS12_381::g2_projective_t* point1, BLS12_381::g2_projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BLS12_381::g2_point_field_t::zero()) && (point1->y == BLS12_381::g2_point_field_t::zero()) && (point1->z == BLS12_381::g2_point_field_t::zero())) && 
-  !((point2->x == BLS12_381::g2_point_field_t::zero()) && (point2->y == BLS12_381::g2_point_field_t::zero()) && (point2->z == BLS12_381::g2_point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_381::g2_point_field_t::zero()) && (point1->y == BLS12_381::g2_point_field_t::zero()) &&
+           (point1->z == BLS12_381::g2_point_field_t::zero())) &&
+         !((point2->x == BLS12_381::g2_point_field_t::zero()) && (point2->y == BLS12_381::g2_point_field_t::zero()) &&
+           (point2->z == BLS12_381::g2_point_field_t::zero()));
 }
 #endif
--- a/icicle/curves/bls12_381/supported_operations.cu
+++ b/icicle/curves/bls12_381/supported_operations.cu
@@ -1,5 +1,5 @@
-#include "projective.cu"
 #include "lde.cu"
 #include "msm.cu"
-#include "ve_mod_mult.cu"
 #include "poseidon.cu"
+#include "projective.cu"
+#include "ve_mod_mult.cu"
--- a/icicle/curves/bls12_381/ve_mod_mult.cu
+++ b/icicle/curves/bls12_381/ve_mod_mult.cu
@@ -1,66 +1,60 @@
 #ifndef _BLS12_381_VEC_MULT
 #define _BLS12_381_VEC_MULT
-#include <stdio.h>
-#include <iostream>
-#include "../../primitives/field.cuh"
-#include "../../utils/storage.cuh"
-#include "../../primitives/projective.cuh"
-#include "curve_config.cuh"
 #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/storage.cuh"
+#include "curve_config.cuh"
+#include <iostream>
+#include <stdio.h>

-extern "C" int32_t vec_mod_mult_point_bls12_381(BLS12_381::projective_t *inout,
-                                      BLS12_381::scalar_t *scalar_vec,
-                                      size_t n_elments,
-                                      size_t device_id,
-                                      cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_point_bls12_381(
+  BLS12_381::projective_t* inout,
+  BLS12_381::scalar_t* scalar_vec,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<BLS12_381::projective_t, BLS12_381::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t vec_mod_mult_scalar_bls12_381(BLS12_381::scalar_t *inout,
-                                       BLS12_381::scalar_t *scalar_vec,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_scalar_bls12_381(
+  BLS12_381::scalar_t* inout,
+  BLS12_381::scalar_t* scalar_vec,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<BLS12_381::scalar_t, BLS12_381::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t matrix_vec_mod_mult_bls12_381(BLS12_381::scalar_t *matrix_flattened,
-                                       BLS12_381::scalar_t *input,
-                                       BLS12_381::scalar_t *output,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t matrix_vec_mod_mult_bls12_381(
+  BLS12_381::scalar_t* matrix_flattened,
+  BLS12_381::scalar_t* input,
+  BLS12_381::scalar_t* output,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
-  try
-  {
+  try {
    // TODO: device_id
    matrix_mod_mult<BLS12_381::scalar_t>(matrix_flattened, input, output, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
--- a/icicle/curves/bn254/curve_config.cuh
+++ b/icicle/curves/bn254/curve_config.cuh
@@ -9,17 +9,17 @@
 #include "params.cuh"

 namespace BN254 {
-    typedef Field<PARAMS_BN254::fp_config> scalar_field_t;
-    typedef scalar_field_t scalar_t;
-    typedef Field<PARAMS_BN254::fq_config> point_field_t;
-    static constexpr point_field_t b = point_field_t{ PARAMS_BN254::weierstrass_b };
-    typedef Projective<point_field_t, scalar_field_t, b> projective_t;
-    typedef Affine<point_field_t> affine_t;
-    #if defined(G2_DEFINED)
-    typedef ExtensionField<PARAMS_BN254::fq_config> g2_point_field_t;
-    static constexpr g2_point_field_t b_g2 = g2_point_field_t{ point_field_t{ PARAMS_BN254::weierstrass_b_g2_re },
-                                                               point_field_t{ PARAMS_BN254::weierstrass_b_g2_im }};
-    typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
-    typedef Affine<g2_point_field_t> g2_affine_t;
-    #endif
-}
+  typedef Field<PARAMS_BN254::fp_config> scalar_field_t;
+  typedef scalar_field_t scalar_t;
+  typedef Field<PARAMS_BN254::fq_config> point_field_t;
+  static constexpr point_field_t b = point_field_t{PARAMS_BN254::weierstrass_b};
+  typedef Projective<point_field_t, scalar_field_t, b> projective_t;
+  typedef Affine<point_field_t> affine_t;
+#if defined(G2_DEFINED)
+  typedef ExtensionField<PARAMS_BN254::fq_config> g2_point_field_t;
+  static constexpr g2_point_field_t b_g2 = g2_point_field_t{
+    point_field_t{PARAMS_BN254::weierstrass_b_g2_re}, point_field_t{PARAMS_BN254::weierstrass_b_g2_im}};
+  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  typedef Affine<g2_point_field_t> g2_affine_t;
+#endif
+} // namespace BN254
--- a/icicle/curves/bn254/lde.cu
+++ b/icicle/curves/bn254/lde.cu
--- a/icicle/curves/bn254/msm.cu
+++ b/icicle/curves/bn254/msm.cu
@@ -1,186 +1,216 @@
 #ifndef _BN254_MSM
 #define _BN254_MSM
 #include "../../appUtils/msm/msm.cu"
-#include <stdexcept>
-#include <cuda.h>
 #include "curve_config.cuh"
+#include <cuda.h>
+#include <stdexcept>

-
-extern "C"
-int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[],
-              BN254::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_cuda_bn254(
+  BN254::projective_t* out,
+  BN254::affine_t points[],
+  BN254::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t points[],
-                              BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_batch_cuda_bn254(
+  BN254::projective_t* out,
+  BN254::affine_t points[],
+  BN254::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 /**
 * Commit to a polynomial using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut point to write the result to.
 * @param d_scalars Scalars for the MSM. Must be on device.
 * @param d_points Points for the MSM. Must be on device.
 * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
 */
-extern "C"
-int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_cuda_bn254(
+  BN254::projective_t* d_out,
+  BN254::scalar_t* d_scalars,
+  BN254::affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
- 
+
 /**
 * Commit to a batch of polynomials using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut point to write the results to.
 * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
 * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
 * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
 * @param batch_size Size of the batch.
 */
-extern "C"
-int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_batch_cuda_bn254(
+  BN254::projective_t* d_out,
+  BN254::scalar_t* d_scalars,
+  BN254::affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 #if defined(G2_DEFINED)
-extern "C"
-int msm_g2_cuda_bn254(BN254::g2_projective_t *out, BN254::g2_affine_t points[],
-              BN254::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_g2_cuda_bn254(
+  BN254::g2_projective_t* out,
+  BN254::g2_affine_t points[],
+  BN254::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<BN254::scalar_t, BN254::g2_projective_t, BN254::g2_affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<BN254::scalar_t, BN254::g2_projective_t, BN254::g2_affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

-extern "C" int msm_batch_g2_cuda_bn254(BN254::g2_projective_t* out, BN254::g2_affine_t points[],
-                              BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int msm_batch_g2_cuda_bn254(
+  BN254::g2_projective_t* out,
+  BN254::g2_affine_t points[],
+  BN254::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<BN254::scalar_t, BN254::g2_projective_t, BN254::g2_affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<BN254::scalar_t, BN254::g2_projective_t, BN254::g2_affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }

 /**
 * Commit to a polynomial using the MSM in G2 group.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
 * @param d_out Ouptut G2 point to write the result to.
 * @param d_scalars Scalars for the MSM. Must be on device.
 * @param d_points G2 affine points for the MSM. Must be on device.
 * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
 */
-extern "C"
-int commit_g2_cuda_bn254(BN254::g2_projective_t* d_out, BN254::scalar_t* d_scalars, BN254::g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
+extern "C" int commit_g2_cuda_bn254(
+  BN254::g2_projective_t* d_out,
+  BN254::scalar_t* d_scalars,
+  BN254::g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
- 
- /**
-  * Commit to a batch of polynomials using the MSM.
-  * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
-  * @param d_out Ouptut G2 point to write the results to.
-  * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
-  * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
-  * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
-  * @param batch_size Size of the batch.
-  */
-extern "C"
-int commit_batch_g2_cuda_bn254(BN254::g2_projective_t* d_out, BN254::scalar_t* d_scalars, BN254::g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+
+/**
+ * Commit to a batch of polynomials using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Ouptut G2 point to write the results to.
+ * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
+ * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for
+ * each MSM.
+ * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
+ * @param batch_size Size of the batch.
+ */
+extern "C" int commit_batch_g2_cuda_bn254(
+  BN254::g2_projective_t* d_out,
+  BN254::scalar_t* d_scalars,
+  BN254::g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
 {
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
 }
 #endif
 #endif
--- a/icicle/curves/bn254/params.cuh
+++ b/icicle/curves/bn254/params.cuh
@@ -6,147 +6,183 @@ namespace PARAMS_BN254 {
    static constexpr unsigned limbs_count = 8;
    static constexpr unsigned omegas_count = 28;
    static constexpr unsigned modulus_bit_count = 254;
-    
-    static constexpr storage<limbs_count> modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
-    static constexpr storage<limbs_count> modulus_2 = {0xe0000002, 0x87c3eb27, 0xf372e122, 0x5067d090, 0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
-    static constexpr storage<limbs_count> modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121, 0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb};
-    static constexpr storage<2*limbs_count> modulus_wide = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2*limbs_count> modulus_squared = {0xe0000001, 0x08c3eb27, 0xdcb34000, 0xc7f26223, 0x68c9bb7f, 0xffe9a62c, 0xe821ddb0, 0xa6ce1975, 0x47b62fe7, 0x2c77527b, 0xd379d3df, 0x85f73bb0, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
-    static constexpr storage<2*limbs_count> modulus_squared_2 = {0xc0000002, 0x1187d64f, 0xb9668000, 0x8fe4c447, 0xd19376ff, 0xffd34c58, 0xd043bb61, 0x4d9c32eb, 0x8f6c5fcf, 0x58eea4f6, 0xa6f3a7be, 0x0bee7761, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
-    static constexpr storage<2*limbs_count> modulus_squared_4 = {0x80000004, 0x230fac9f, 0x72cd0000, 0x1fc9888f, 0xa326edff, 0xffa698b1, 0xa08776c3, 0x9b3865d7, 0x1ed8bf9e, 0xb1dd49ed, 0x4de74f7c, 0x17dceec3, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};

-    static constexpr storage<limbs_count> m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520, 0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695, 0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9, 0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951};
+    static constexpr storage<limbs_count> modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848,
+                                                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
+    static constexpr storage<limbs_count> modulus_2 = {0xe0000002, 0x87c3eb27, 0xf372e122, 0x5067d090,
+                                                       0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
+    static constexpr storage<limbs_count> modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121,
+                                                       0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0xe0000001, 0x08c3eb27, 0xdcb34000, 0xc7f26223, 0x68c9bb7f, 0xffe9a62c, 0xe821ddb0, 0xa6ce1975,
+      0x47b62fe7, 0x2c77527b, 0xd379d3df, 0x85f73bb0, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0xc0000002, 0x1187d64f, 0xb9668000, 0x8fe4c447, 0xd19376ff, 0xffd34c58, 0xd043bb61, 0x4d9c32eb,
+      0x8f6c5fcf, 0x58eea4f6, 0xa6f3a7be, 0x0bee7761, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x80000004, 0x230fac9f, 0x72cd0000, 0x1fc9888f, 0xa326edff, 0xffa698b1, 0xa08776c3, 0x9b3865d7,
+      0x1ed8bf9e, 0xb1dd49ed, 0x4de74f7c, 0x17dceec3, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};

-    static constexpr storage_array<omegas_count, limbs_count> omega = { {
-              {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72},
-              {0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72},
-              {0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1},
-              {0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2},
-              {0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6},
-              {0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d},
-              {0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd},
-              {0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561},
-              {0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e},
-              {0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1},
-              {0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584},
-              {0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596},
-              {0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49},
-              {0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651},
-              {0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f},
-              {0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb},
-              {0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb},
-              {0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf},
-              {0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724},
-              {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1},
-              {0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980},
-              {0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46},
-              {0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63},
-              {0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7},
-              {0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb},
-              {0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a},
-              {0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702},
-              {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0}
-    } };
+    static constexpr storage<limbs_count> m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520,
+                                               0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695,
+                                                          0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9,
+                                                              0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951};

+    static constexpr storage_array<omegas_count, limbs_count> omega = {
+      {{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72},
+       {0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72},
+       {0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1},
+       {0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2},
+       {0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6},
+       {0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d},
+       {0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd},
+       {0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561},
+       {0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e},
+       {0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1},
+       {0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584},
+       {0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596},
+       {0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49},
+       {0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651},
+       {0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f},
+       {0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb},
+       {0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb},
+       {0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf},
+       {0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724},
+       {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1},
+       {0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980},
+       {0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46},
+       {0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63},
+       {0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7},
+       {0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb},
+       {0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a},
+       {0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702},
+       {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0}}};

-    static constexpr storage_array<omegas_count, limbs_count> omega_inv = { {
-              {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72},
-              {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000},
-              {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711},
-              {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf},
-              {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136},
-              {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2},
-              {0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053},
-              {0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327},
-              {0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821},
-              {0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7},
-              {0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21},
-              {0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2},
-              {0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58},
-              {0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f},
-              {0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766},
-              {0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7},
-              {0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010},
-              {0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0},
-              {0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0},
-              {0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d},
-              {0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8},
-              {0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439},
-              {0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5},
-              {0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370},
-              {0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd},
-              {0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9},
-              {0x980db869, 0xa8b64ba8, 0xc9718f6c, 0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f},
-              {0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519}
-    } };
-    
+    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
+      {{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72},
+       {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000},
+       {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711},
+       {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf},
+       {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136},
+       {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2},
+       {0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053},
+       {0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327},
+       {0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821},
+       {0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7},
+       {0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21},
+       {0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2},
+       {0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58},
+       {0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f},
+       {0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766},
+       {0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7},
+       {0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010},
+       {0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0},
+       {0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0},
+       {0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d},
+       {0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8},
+       {0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439},
+       {0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5},
+       {0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370},
+       {0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd},
+       {0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9},
+       {0x980db869, 0xa8b64ba8, 0xc9718f6c, 0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f},
+       {0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519}}};

-    static constexpr storage_array<omegas_count, limbs_count> inv = { {
-              {0xf8000001, 0xa1f0fac9, 0x3cdcb848, 0x9419f424, 0x40c0ac2e, 0xdc2822db, 0x7098d014, 0x18322739},
-              {0xf4000001, 0xf2e9782e, 0x5b4b146c, 0xde26ee36, 0xe1210245, 0x4a3c3448, 0x28e5381f, 0x244b3ad6},
-              {0x72000001, 0x1b65b6e1, 0x6a82427f, 0x832d6b3f, 0xb1512d51, 0x81463cff, 0x850b6c24, 0x2a57c4a4},
-              {0xb1000001, 0x2fa3d63a, 0xf21dd988, 0x55b0a9c3, 0x196942d7, 0x1ccb415b, 0xb31e8627, 0x2d5e098b},
-              {0x50800001, 0xb9c2e5e7, 0x35eba50c, 0x3ef24906, 0xcd754d9a, 0x6a8dc388, 0x4a281328, 0x2ee12bff},
-              {0xa0400001, 0xfed26dbd, 0x57d28ace, 0xb39318a7, 0xa77b52fb, 0x116f049f, 0x15acd9a9, 0x2fa2bd39},
-              {0xc8200001, 0x215a31a8, 0xe8c5fdb0, 0x6de38077, 0x147e55ac, 0x64dfa52b, 0xfb6f3ce9, 0x300385d5},
-              {0x5c100001, 0xb29e139e, 0x313fb720, 0xcb0bb460, 0xcaffd704, 0x8e97f570, 0x6e506e89, 0x3033ea24},
-              {0x26080001, 0xfb400499, 0x557c93d8, 0xf99fce54, 0xa64097b0, 0xa3741d93, 0xa7c10759, 0x304c1c4b},
-              {0x8b040001, 0x1f90fd16, 0x679b0235, 0x10e9db4e, 0x13e0f807, 0xade231a5, 0x447953c1, 0x3058355f},
-              {0x3d820001, 0x31b97955, 0x70aa3963, 0x1c8ee1cb, 0xcab12832, 0xb3193bad, 0x12d579f5, 0x305e41e9},
-              {0x96c10001, 0x3acdb774, 0xf531d4fa, 0xa2616509, 0x26194047, 0xb5b4c0b2, 0xfa038d0f, 0x3061482d},
-              {0x43608001, 0xbf57d684, 0x3775a2c5, 0x654aa6a9, 0x53cd4c52, 0xb7028334, 0x6d9a969c, 0x3062cb50},
-              {0x19b04001, 0x819ce60c, 0xd89789ab, 0xc6bf4778, 0x6aa75257, 0x37a96475, 0xa7661b63, 0x30638ce1},
-              {0x04d82001, 0x62bf6dd0, 0xa9287d1e, 0x777997e0, 0xf614555a, 0x77fcd515, 0x444bddc6, 0x3063edaa},
-              {0xfa6c1001, 0xd350b1b1, 0x9170f6d7, 0xcfd6c014, 0x3bcad6db, 0x18268d66, 0x92bebef8, 0x30641e0e},
-              {0xf5360801, 0x8b9953a2, 0x859533b4, 0x7c05542e, 0x5ea6179c, 0xe83b698e, 0xb9f82f90, 0x30643640},
-              {0x729b0401, 0xe7bda49b, 0x7fa75222, 0xd21c9e3b, 0x7013b7fc, 0x5045d7a2, 0xcd94e7dd, 0x30644259},
-              {0xb14d8201, 0x15cfcd17, 0xfcb0615a, 0xfd284341, 0x78ca882c, 0x844b0eac, 0x57634403, 0x30644866},
-              {0xd0a6c101, 0xacd8e155, 0x3b34e8f5, 0x12ae15c5, 0x7d25f045, 0x9e4daa31, 0x9c4a7216, 0x30644b6c},
-              {0xe0536081, 0x785d6b74, 0xda772cc3, 0x1d70ff06, 0xff53a451, 0x2b4ef7f3, 0xbebe0920, 0x30644cef},
-              {0x6829b041, 0x5e1fb084, 0xaa184eaa, 0x22d273a7, 0x406a7e57, 0xf1cf9ed5, 0x4ff7d4a4, 0x30644db1},
-              {0x2c14d821, 0xd100d30c, 0x11e8df9d, 0x25832df8, 0xe0f5eb5a, 0x550ff245, 0x1894ba67, 0x30644e12},
-              {0x0e0a6c11, 0x8a716450, 0x45d12817, 0xa6db8b20, 0x313ba1db, 0x86b01bfe, 0x7ce32d48, 0x30644e42},
-              {0xff053609, 0x6729acf1, 0x5fc54c54, 0x6787b9b4, 0x595e7d1c, 0x1f8030da, 0xaf0a66b9, 0x30644e5a},
-              {0xf7829b05, 0xd585d142, 0x6cbf5e72, 0xc7ddd0fe, 0x6d6feabc, 0x6be83b48, 0xc81e0371, 0x30644e66},
-              {0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c},
-              {0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}
-    } }; 
+    static constexpr storage_array<omegas_count, limbs_count> inv = {
+      {{0xf8000001, 0xa1f0fac9, 0x3cdcb848, 0x9419f424, 0x40c0ac2e, 0xdc2822db, 0x7098d014, 0x18322739},
+       {0xf4000001, 0xf2e9782e, 0x5b4b146c, 0xde26ee36, 0xe1210245, 0x4a3c3448, 0x28e5381f, 0x244b3ad6},
+       {0x72000001, 0x1b65b6e1, 0x6a82427f, 0x832d6b3f, 0xb1512d51, 0x81463cff, 0x850b6c24, 0x2a57c4a4},
+       {0xb1000001, 0x2fa3d63a, 0xf21dd988, 0x55b0a9c3, 0x196942d7, 0x1ccb415b, 0xb31e8627, 0x2d5e098b},
+       {0x50800001, 0xb9c2e5e7, 0x35eba50c, 0x3ef24906, 0xcd754d9a, 0x6a8dc388, 0x4a281328, 0x2ee12bff},
+       {0xa0400001, 0xfed26dbd, 0x57d28ace, 0xb39318a7, 0xa77b52fb, 0x116f049f, 0x15acd9a9, 0x2fa2bd39},
+       {0xc8200001, 0x215a31a8, 0xe8c5fdb0, 0x6de38077, 0x147e55ac, 0x64dfa52b, 0xfb6f3ce9, 0x300385d5},
+       {0x5c100001, 0xb29e139e, 0x313fb720, 0xcb0bb460, 0xcaffd704, 0x8e97f570, 0x6e506e89, 0x3033ea24},
+       {0x26080001, 0xfb400499, 0x557c93d8, 0xf99fce54, 0xa64097b0, 0xa3741d93, 0xa7c10759, 0x304c1c4b},
+       {0x8b040001, 0x1f90fd16, 0x679b0235, 0x10e9db4e, 0x13e0f807, 0xade231a5, 0x447953c1, 0x3058355f},
+       {0x3d820001, 0x31b97955, 0x70aa3963, 0x1c8ee1cb, 0xcab12832, 0xb3193bad, 0x12d579f5, 0x305e41e9},
+       {0x96c10001, 0x3acdb774, 0xf531d4fa, 0xa2616509, 0x26194047, 0xb5b4c0b2, 0xfa038d0f, 0x3061482d},
+       {0x43608001, 0xbf57d684, 0x3775a2c5, 0x654aa6a9, 0x53cd4c52, 0xb7028334, 0x6d9a969c, 0x3062cb50},
+       {0x19b04001, 0x819ce60c, 0xd89789ab, 0xc6bf4778, 0x6aa75257, 0x37a96475, 0xa7661b63, 0x30638ce1},
+       {0x04d82001, 0x62bf6dd0, 0xa9287d1e, 0x777997e0, 0xf614555a, 0x77fcd515, 0x444bddc6, 0x3063edaa},
+       {0xfa6c1001, 0xd350b1b1, 0x9170f6d7, 0xcfd6c014, 0x3bcad6db, 0x18268d66, 0x92bebef8, 0x30641e0e},
+       {0xf5360801, 0x8b9953a2, 0x859533b4, 0x7c05542e, 0x5ea6179c, 0xe83b698e, 0xb9f82f90, 0x30643640},
+       {0x729b0401, 0xe7bda49b, 0x7fa75222, 0xd21c9e3b, 0x7013b7fc, 0x5045d7a2, 0xcd94e7dd, 0x30644259},
+       {0xb14d8201, 0x15cfcd17, 0xfcb0615a, 0xfd284341, 0x78ca882c, 0x844b0eac, 0x57634403, 0x30644866},
+       {0xd0a6c101, 0xacd8e155, 0x3b34e8f5, 0x12ae15c5, 0x7d25f045, 0x9e4daa31, 0x9c4a7216, 0x30644b6c},
+       {0xe0536081, 0x785d6b74, 0xda772cc3, 0x1d70ff06, 0xff53a451, 0x2b4ef7f3, 0xbebe0920, 0x30644cef},
+       {0x6829b041, 0x5e1fb084, 0xaa184eaa, 0x22d273a7, 0x406a7e57, 0xf1cf9ed5, 0x4ff7d4a4, 0x30644db1},
+       {0x2c14d821, 0xd100d30c, 0x11e8df9d, 0x25832df8, 0xe0f5eb5a, 0x550ff245, 0x1894ba67, 0x30644e12},
+       {0x0e0a6c11, 0x8a716450, 0x45d12817, 0xa6db8b20, 0x313ba1db, 0x86b01bfe, 0x7ce32d48, 0x30644e42},
+       {0xff053609, 0x6729acf1, 0x5fc54c54, 0x6787b9b4, 0x595e7d1c, 0x1f8030da, 0xaf0a66b9, 0x30644e5a},
+       {0xf7829b05, 0xd585d142, 0x6cbf5e72, 0xc7ddd0fe, 0x6d6feabc, 0x6be83b48, 0xc81e0371, 0x30644e66},
+       {0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c},
+       {0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}}};
  };

  struct fq_config {
    static constexpr unsigned limbs_count = 8;
    static constexpr unsigned modulus_bit_count = 254;
-    static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
-    static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522, 0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
-    static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45, 0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
-    static constexpr storage<2*limbs_count> modulus_wide = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2*limbs_count> modulus_squared = {0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95, 0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
-    static constexpr storage<2*limbs_count> modulus_squared_2 = {0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a, 0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
-    static constexpr storage<2*limbs_count> modulus_squared_4 = {0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55, 0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
-    static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17, 0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28, 0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x14afa37, 0xed84884a, 0x278edf8, 0xeb202285, 0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
+    static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
+                                                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
+    static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
+                                                       0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
+    static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
+                                                       0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95,
+      0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a,
+      0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55,
+      0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
+    static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17,
+                                               0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28,
+                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x14afa37,  0xed84884a, 0x278edf8,  0xeb202285,
+                                                              0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};

    // i^2, the square of the imaginary unit for the extension field
    static constexpr uint32_t i_squared = 1;
    // true if i^2 is negative
    static constexpr bool i_squared_is_negative = true;
-    // G1 and G2 generators 
-    static constexpr storage<limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4, 0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
-    static constexpr storage<limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933, 0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
-    static constexpr storage<limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769, 0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
-    static constexpr storage<limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133, 0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
+    // G1 and G2 generators
+    static constexpr storage<limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                      0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
+                                                      0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
+                                                         0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
+    static constexpr storage<limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
+                                                         0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
+    static constexpr storage<limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
+                                                         0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
+    static constexpr storage<limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
+                                                         0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
  };

-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0};
-}
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0};
+} // namespace PARAMS_BN254
--- a/icicle/curves/bn254/projective.cu
+++ b/icicle/curves/bn254/projective.cu
@@ -1,68 +1,60 @@
-#include <cuda.h>
-#include "curve_config.cuh"
 #include "../../primitives/projective.cuh"
+#include "curve_config.cuh"
+#include <cuda.h>

-extern "C" BN254::projective_t random_projective_bn254()
-{
-  return BN254::projective_t::rand_host();
-}
+extern "C" BN254::projective_t random_projective_bn254() { return BN254::projective_t::rand_host(); }

-extern "C" BN254::projective_t projective_zero_bn254()
-{
-  return BN254::projective_t::zero();
-}
+extern "C" BN254::projective_t projective_zero_bn254() { return BN254::projective_t::zero(); }

-extern "C" bool projective_is_on_curve_bn254(BN254::projective_t *point1)
+extern "C" bool projective_is_on_curve_bn254(BN254::projective_t* point1)
 {
  return BN254::projective_t::is_on_curve(*point1);
 }

-extern "C" BN254::affine_t projective_to_affine_bn254(BN254::projective_t *point1)
+extern "C" BN254::affine_t projective_to_affine_bn254(BN254::projective_t* point1)
 {
  return BN254::projective_t::to_affine(*point1);
 }

-extern "C" BN254::projective_t projective_from_affine_bn254(BN254::affine_t *point1)
+extern "C" BN254::projective_t projective_from_affine_bn254(BN254::affine_t* point1)
 {
  return BN254::projective_t::from_affine(*point1);
 }

-extern "C" BN254::scalar_field_t random_scalar_bn254()
-{
-  return BN254::scalar_field_t::rand_host();
-}
+extern "C" BN254::scalar_field_t random_scalar_bn254() { return BN254::scalar_field_t::rand_host(); }

-extern "C" bool eq_bn254(BN254::projective_t *point1, BN254::projective_t *point2)
+extern "C" bool eq_bn254(BN254::projective_t* point1, BN254::projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BN254::point_field_t::zero()) && (point1->y == BN254::point_field_t::zero()) && (point1->z == BN254::point_field_t::zero())) && 
-  !((point2->x == BN254::point_field_t::zero()) && (point2->y == BN254::point_field_t::zero()) && (point2->z == BN254::point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BN254::point_field_t::zero()) && (point1->y == BN254::point_field_t::zero()) &&
+           (point1->z == BN254::point_field_t::zero())) &&
+         !((point2->x == BN254::point_field_t::zero()) && (point2->y == BN254::point_field_t::zero()) &&
+           (point2->z == BN254::point_field_t::zero()));
 }

 #if defined(G2_DEFINED)
-extern "C" bool eq_g2_bn254(BN254::g2_projective_t *point1, BN254::g2_projective_t *point2)
+extern "C" bool eq_g2_bn254(BN254::g2_projective_t* point1, BN254::g2_projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) && (point1->z == BN254::g2_point_field_t::zero())) && 
-  !((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) && (point2->z == BN254::g2_point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) &&
+           (point1->z == BN254::g2_point_field_t::zero())) &&
+         !((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) &&
+           (point2->z == BN254::g2_point_field_t::zero()));
 }

-extern "C" BN254::g2_projective_t random_g2_projective_bn254()
-{
-  return BN254::g2_projective_t::rand_host();
-}
+extern "C" BN254::g2_projective_t random_g2_projective_bn254() { return BN254::g2_projective_t::rand_host(); }

-extern "C" BN254::g2_affine_t g2_projective_to_affine_bn254(BN254::g2_projective_t *point1)
+extern "C" BN254::g2_affine_t g2_projective_to_affine_bn254(BN254::g2_projective_t* point1)
 {
  return BN254::g2_projective_t::to_affine(*point1);
 }

-extern "C" BN254::g2_projective_t g2_projective_from_affine_bn254(BN254::g2_affine_t *point1)
+extern "C" BN254::g2_projective_t g2_projective_from_affine_bn254(BN254::g2_affine_t* point1)
 {
  return BN254::g2_projective_t::from_affine(*point1);
 }

-extern "C" bool g2_projective_is_on_curve_bn254(BN254::g2_projective_t *point1)
+extern "C" bool g2_projective_is_on_curve_bn254(BN254::g2_projective_t* point1)
 {
  return BN254::g2_projective_t::is_on_curve(*point1);
 }
--- a/icicle/curves/bn254/supported_operations.cu
+++ b/icicle/curves/bn254/supported_operations.cu
@@ -1,4 +1,4 @@
-#include "projective.cu"
 #include "lde.cu"
 #include "msm.cu"
+#include "projective.cu"
 #include "ve_mod_mult.cu"
--- a/icicle/curves/bn254/ve_mod_mult.cu
+++ b/icicle/curves/bn254/ve_mod_mult.cu
@@ -1,88 +1,70 @@
 #ifndef _BN254_VEC_MULT
 #define _BN254_VEC_MULT
-#include <stdio.h>
-#include <iostream>
-#include "../../primitives/field.cuh"
-#include "../../utils/storage.cuh"
-#include "../../primitives/projective.cuh"
-#include "curve_config.cuh"
 #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/storage.cuh"
+#include "curve_config.cuh"
+#include <iostream>
+#include <stdio.h>

-
-extern "C" int32_t vec_mod_mult_point_bn254(BN254::projective_t *inout,
-                                      BN254::scalar_t *scalar_vec,
-                                      size_t n_elments,
-                                      size_t device_id,
-                                      cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_point_bn254(
+  BN254::projective_t* inout, BN254::scalar_t* scalar_vec, size_t n_elments, size_t device_id, cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<BN254::projective_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t vec_mod_mult_scalar_bn254(BN254::scalar_t *inout,
-                                       BN254::scalar_t *scalar_vec,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_scalar_bn254(
+  BN254::scalar_t* inout, BN254::scalar_t* scalar_vec, size_t n_elments, size_t device_id, cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<BN254::scalar_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

 extern "C" int32_t vec_mod_mult_device_scalar_bn254(
-    BN254::scalar_t *inout,
-    BN254::scalar_t *scalar_vec,
-    size_t n_elements,
-    size_t device_id
-) {
+  BN254::scalar_t* inout, BN254::scalar_t* scalar_vec, size_t n_elements, size_t device_id)
+{
  try {
    vector_mod_mult_device<BN254::scalar_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elements);
    return CUDA_SUCCESS;
-  } catch (const std::runtime_error &ex) {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t matrix_vec_mod_mult_bn254(BN254::scalar_t *matrix_flattened,
-                                       BN254::scalar_t *input,
-                                       BN254::scalar_t *output,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t matrix_vec_mod_mult_bn254(
+  BN254::scalar_t* matrix_flattened,
+  BN254::scalar_t* input,
+  BN254::scalar_t* output,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    matrix_mod_mult<BN254::scalar_t>(matrix_flattened, input, output, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
--- a/icicle/curves/curve_template/curve_config.cuh
+++ b/icicle/curves/curve_template/curve_config.cuh
@@ -1,25 +0,0 @@
-#pragma once
-
-#include "../../primitives/field.cuh"
-#include "../../primitives/projective.cuh"
-#if defined(G2_DEFINED)
-#include "../../primitives/extension_field.cuh"
-#endif
-
-#include "params.cuh"
-
-namespace ${CURVE_NAME_U} {
-    typedef Field<PARAMS_${CURVE_NAME_U}::fp_config> scalar_field_t;
-    typedef scalar_field_t scalar_t;
-    typedef Field<PARAMS_${CURVE_NAME_U}::fq_config> point_field_t;
-    static constexpr point_field_t b = point_field_t{ PARAMS_${CURVE_NAME_U}::weierstrass_b };
-    typedef Projective<point_field_t, scalar_field_t, b> projective_t;
-    typedef Affine<point_field_t> affine_t;
-    #if defined(G2_DEFINED)
-    typedef ExtensionField<PARAMS_${CURVE_NAME_U}::fq_config> g2_point_field_t;
-    static constexpr g2_point_field_t b_g2 = g2_point_field_t{ point_field_t{ PARAMS_${CURVE_NAME_U}::weierstrass_b_g2_re },
-                                                               point_field_t{ PARAMS_${CURVE_NAME_U}::weierstrass_b_g2_im }};
-    typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
-    typedef Affine<g2_point_field_t> g2_affine_t;
-    #endif
-}
--- a/icicle/curves/curve_template/curve_config.cuh.tmpl
+++ b/icicle/curves/curve_template/curve_config.cuh.tmpl
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#if defined(G2_DEFINED)
+#include "../../primitives/extension_field.cuh"
+#endif
+
+#include "params.cuh"
+
+namespace ${CURVE_NAME_U} {
+  typedef Field<PARAMS_${CURVE_NAME_U}::fp_config> scalar_field_t;
+  typedef scalar_field_t scalar_t;
+  typedef Field<PARAMS_${CURVE_NAME_U}::fq_config> point_field_t;
+  static constexpr point_field_t b = point_field_t{PARAMS_${CURVE_NAME_U}::weierstrass_b};
+  typedef Projective<point_field_t, scalar_field_t, b> projective_t;
+  typedef Affine<point_field_t> affine_t;
+#if defined(G2_DEFINED)
+  typedef ExtensionField<PARAMS_${CURVE_NAME_U}::fq_config> g2_point_field_t;
+  static constexpr g2_point_field_t b_g2 = g2_point_field_t{ 
+    point_field_t{PARAMS_${CURVE_NAME_U}::weierstrass_b_g2_re}, point_field_t{PARAMS_${CURVE_NAME_U}::weierstrass_b_g2_im}};
+  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  typedef Affine<g2_point_field_t> g2_affine_t;
+#endif
+}
--- a/icicle/curves/curve_template/lde.cu
+++ b/icicle/curves/curve_template/lde.cu
@@ -1,567 +0,0 @@
-#ifndef _${CURVE_NAME_U}_LDE
-#define _${CURVE_NAME_U}_LDE
-#include <cuda.h>
-#include "../../appUtils/ntt/lde.cu"
-#include "../../appUtils/ntt/ntt.cuh"
-#include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
-#include "curve_config.cuh"
-#include "../../utils/mont.cuh"
-
-
-
-extern "C" ${CURVE_NAME_U}::scalar_t* build_domain_cuda_${CURVE_NAME_L}(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        if (inverse) {
-            return fill_twiddle_factors_array(domain_size, ${CURVE_NAME_U}::scalar_t::omega_inv(logn), stream);
-        } else {
-            return fill_twiddle_factors_array(domain_size, ${CURVE_NAME_U}::scalar_t::omega(logn), stream);
-        }
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return nullptr;
-    }
-}
-
-extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_template<${CURVE_NAME_U}::scalar_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        
-        return -1;        
-    }
-}
-
-extern "C" int ecntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_template<${CURVE_NAME_U}::projective_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int ntt_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_batch_template<${CURVE_NAME_U}::scalar_t,${CURVE_NAME_U}::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int ecntt_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return ntt_end2end_batch_template<${CURVE_NAME_U}::projective_t,${CURVE_NAME_U}::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int interpolate_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t *d_evaluations, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int interpolate_scalars_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_evaluations, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned n,
-                                              unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int interpolate_scalars_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t *d_evaluations, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned n, ${CURVE_NAME_U}::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int interpolate_scalars_batch_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_evaluations, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned n,
-                                              unsigned batch_size, ${CURVE_NAME_U}::scalar_t* coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int interpolate_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::projective_t *d_evaluations, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int interpolate_points_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::projective_t* d_evaluations, ${CURVE_NAME_U}::scalar_t* d_domain,
-                                             unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t *d_coefficients, ${CURVE_NAME_U}::scalar_t *d_domain, 
-                                     unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_scalars_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_coefficients, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned domain_size,
-                                           unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::projective_t *d_coefficients, ${CURVE_NAME_U}::scalar_t *d_domain, 
-                                    unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_points_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::projective_t* d_coefficients, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned domain_size,
-                                          unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        cudaStreamCreate(&stream);
-        return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_scalars_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t *d_coefficients, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned domain_size,
-                                              unsigned n, ${CURVE_NAME_U}::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_scalars_on_coset_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_coefficients, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned domain_size, 
-                                                    unsigned n, unsigned batch_size, ${CURVE_NAME_U}::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_points_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::projective_t *d_coefficients, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned domain_size,
-                                             unsigned n, ${CURVE_NAME_U}::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int evaluate_points_on_coset_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::projective_t* d_coefficients, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned domain_size, 
-                                                   unsigned n, unsigned batch_size, ${CURVE_NAME_U}::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int ntt_inplace_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_inout, ${CURVE_NAME_U}::scalar_t* d_twiddles,
-                                           unsigned n, unsigned batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-
-        cudaStreamCreate(&stream);
-        ${CURVE_NAME_U}::scalar_t* _null = nullptr;
-        ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true);
-        return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int ntt_inplace_coset_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_inout, ${CURVE_NAME_U}::scalar_t* d_twiddles,
-                                           unsigned n, unsigned batch_size, bool inverse, bool is_coset, ${CURVE_NAME_U}::scalar_t* coset, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, is_coset, coset, stream, true);
-        return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int sub_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_in1, ${CURVE_NAME_U}::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return sub_polys(d_out, d_in1, d_in2, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int add_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_in1, ${CURVE_NAME_U}::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return add_polys(d_out, d_in1, d_in2, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int to_montgomery_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery(d_inout, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int from_montgomery_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery(d_inout, n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int to_montgomery_proj_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 3 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int from_montgomery_proj_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 3 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int to_montgomery_aff_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 2 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int from_montgomery_aff_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 2 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-#if defined(G2_DEFINED)
-extern "C" int to_montgomery_proj_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 6 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int from_montgomery_proj_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 6 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int to_montgomery_aff_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 4 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int from_montgomery_aff_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 4 * n, stream);
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-#endif
-
-extern "C" int reverse_order_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order(arr, n, logn, stream);
-        cudaStreamSynchronize(stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int reverse_order_scalars_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order_batch(arr, n, logn, batch_size, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int reverse_order_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order(arr, n, logn, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int reverse_order_points_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        uint32_t logn = uint32_t(log(n) / log(2));
-        cudaStreamCreate(&stream);
-        reverse_order_batch(arr, n, logn, batch_size, stream);
-        return 0;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-#endif
--- a/icicle/curves/curve_template/lde.cu.tmpl
+++ b/icicle/curves/curve_template/lde.cu.tmpl
@@ -0,0 +1,592 @@
+#ifndef _${CURVE_NAME_U}_LDE
+#define _${CURVE_NAME_U}_LDE
+#include "../../appUtils/ntt/lde.cu"
+#include "../../appUtils/ntt/ntt.cuh"
+#include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
+#include "../../utils/mont.cuh"
+#include "curve_config.cuh"
+#include <cuda.h>
+
+extern "C" ${CURVE_NAME_U}::scalar_t* build_domain_cuda_${CURVE_NAME_L}(
+  uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
+{ // C-ABI wrapper: returns fill_twiddle_factors_array() for omega(logn), or omega_inv(logn) when inverse is set.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    if (inverse) {
+      return fill_twiddle_factors_array(domain_size, ${CURVE_NAME_U}::scalar_t::omega_inv(logn), stream);
+    } else {
+      return fill_twiddle_factors_array(domain_size, ${CURVE_NAME_U}::scalar_t::omega(logn), stream);
+    }
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled; device_id is never read
+    printf("error %s", ex.what());
+    return nullptr; // failure is signalled with a null pointer
+  }
+}
+
+extern "C" int ntt_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0)
+{ // C-ABI wrapper over ntt_end2end_template on scalars; `decimation` and device_id are not read in the body.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return ntt_end2end_template<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int ecntt_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* arr,
+  uint32_t n,
+  bool inverse,
+  Decimation decimation, // not referenced in the body
+  size_t device_id = 0, // not referenced in the body (see TODO below)
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over ntt_end2end_template with projective_t elements and scalar_t as second template argument.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return ntt_end2end_template<${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int ntt_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* arr,
+  uint32_t arr_size,
+  uint32_t batch_size,
+  bool inverse,
+  size_t device_id = 0, // not referenced in the body (see TODO below)
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over ntt_end2end_batch_template on scalars; forwards its return value.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return ntt_end2end_batch_template<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::scalar_t>(
+      arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int ecntt_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* arr,
+  uint32_t arr_size,
+  uint32_t batch_size,
+  bool inverse,
+  size_t device_id = 0, // not referenced in the body (see TODO below)
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over ntt_end2end_batch_template with projective_t elements; forwards its return value.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return ntt_end2end_batch_template<${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::scalar_t>(
+      arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int interpolate_scalars_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_evaluations,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id = 0, // not referenced in the body; NOTE(review): `unsigned` here, size_t in most siblings
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over interpolate(); uses `stream` exactly as passed (no cudaStreamCreate in this variant).
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the on_coset variant passes coset_powers
+    return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int interpolate_scalars_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_evaluations,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over interpolate_batch() with the flag argument false and a null pointer after it.
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the on_coset variant passes coset_powers
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int interpolate_scalars_on_coset_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_evaluations,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned n,
+  ${CURVE_NAME_U}::scalar_t* coset_powers,
+  unsigned device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over interpolate() with the flag argument true and coset_powers; uses `stream` as passed.
+  try {
+    return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int interpolate_scalars_batch_on_coset_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_evaluations,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  ${CURVE_NAME_U}::scalar_t* coset_powers,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over interpolate_batch() with the flag argument true and coset_powers.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int interpolate_points_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::projective_t* d_evaluations,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned n,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over interpolate() on projective points; uses `stream` exactly as passed (no cudaStreamCreate).
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null for the argument that follows the `false` flag
+    return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int interpolate_points_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::projective_t* d_evaluations,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over interpolate_batch() on projective points, flag argument false with a null pointer.
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null for the argument that follows the `false` flag
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_scalars_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate() on scalars, flag argument false with a null pointer after it.
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the on_coset variant passes coset_powers
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_scalars_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate_batch() on scalars, flag argument false with a null pointer after it.
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the on_coset variant passes coset_powers
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_points_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::projective_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate() on projective points, flag argument false with a null pointer after it.
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the on_coset variant passes coset_powers
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_points_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::projective_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate_batch() on projective points, flag argument false with a null pointer after it.
+  try {
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the on_coset variant passes coset_powers
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_scalars_on_coset_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  ${CURVE_NAME_U}::scalar_t* coset_powers,
+  unsigned device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate() on scalars with the flag argument true and coset_powers.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_scalars_on_coset_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  ${CURVE_NAME_U}::scalar_t* coset_powers,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate_batch() on scalars with the flag argument true and coset_powers.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_points_on_coset_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::projective_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  ${CURVE_NAME_U}::scalar_t* coset_powers,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate() on projective points with the flag argument true and coset_powers.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int evaluate_points_on_coset_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::projective_t* d_coefficients,
+  ${CURVE_NAME_U}::scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  ${CURVE_NAME_U}::scalar_t* coset_powers,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper over evaluate_batch() on projective points with the flag argument true and coset_powers.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int ntt_inplace_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_inout,
+  ${CURVE_NAME_U}::scalar_t* d_twiddles,
+  unsigned n,
+  unsigned batch_size,
+  bool inverse,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper: calls ntt_inplace_batch_template() with is_coset = false, then reports CUDA_SUCCESS.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    ${CURVE_NAME_U}::scalar_t* _null = nullptr; // typed null, passed where the coset variant passes `coset`
+    ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true);
+    return CUDA_SUCCESS; // TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
+  } catch (const std::runtime_error& ex) { // the only failure path: any result of the call above is not inspected
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int ntt_inplace_coset_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_inout,
+  ${CURVE_NAME_U}::scalar_t* d_twiddles,
+  unsigned n,
+  unsigned batch_size,
+  bool inverse,
+  bool is_coset,
+  ${CURVE_NAME_U}::scalar_t* coset,
+  size_t device_id = 0, // not referenced in the body
+  cudaStream_t stream = 0)
+{ // C-ABI wrapper: forwards is_coset and coset to ntt_inplace_batch_template(), then reports CUDA_SUCCESS.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, is_coset, coset, stream, true);
+    return CUDA_SUCCESS; // TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
+  } catch (const std::runtime_error& ex) { // the only failure path: any result of the call above is not inspected
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int sub_scalars_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_in1, ${CURVE_NAME_U}::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper over sub_polys(d_out, d_in1, d_in2, n, stream); forwards its return value.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return sub_polys(d_out, d_in1, d_in2, n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int add_scalars_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_in1, ${CURVE_NAME_U}::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper over add_polys(d_out, d_in1, d_in2, n, stream); forwards its return value.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return add_polys(d_out, d_in1, d_in2, n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int to_montgomery_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper over to_montgomery() on n scalars; forwards its return value.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return to_montgomery(d_inout, n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int from_montgomery_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper over from_montgomery() on n scalars; forwards its return value.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return from_montgomery(d_inout, n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int to_montgomery_proj_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper: to_montgomery() over the buffer viewed as 3 * n point_field_t values (presumably 3 per point).
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 3 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int from_montgomery_proj_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper: from_montgomery() over the buffer viewed as 3 * n point_field_t values (presumably 3 per point).
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 3 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int to_montgomery_aff_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper: to_montgomery() over the buffer viewed as 2 * n point_field_t values (presumably 2 per point).
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 2 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int from_montgomery_aff_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper: from_montgomery() over the buffer viewed as 2 * n point_field_t values (presumably 2 per point).
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 2 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+#if defined(G2_DEFINED)
+extern "C" int
+to_montgomery_proj_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper (G2_DEFINED builds only): to_montgomery() over the buffer viewed as 6 * n point_field_t values.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 6 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int
+from_montgomery_proj_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper (G2_DEFINED builds only): from_montgomery() over the buffer viewed as 6 * n point_field_t values.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 6 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int to_montgomery_aff_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper (G2_DEFINED builds only): to_montgomery() over the buffer viewed as 4 * n point_field_t values.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return to_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 4 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int
+from_montgomery_aff_points_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0)
+{ // C-ABI wrapper (G2_DEFINED builds only): from_montgomery() over the buffer viewed as 4 * n point_field_t values.
+  try {
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    return from_montgomery((${CURVE_NAME_U}::point_field_t*)d_inout, 4 * n, stream);
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+#endif
+
+extern "C" int
+reverse_order_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
+{ // C-ABI wrapper: reverse_order() on arr, then waits for the stream; 0 on completion. device_id is not read.
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2)); // truncated floating-point log2; NOTE(review): assumes n is 2^k
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    reverse_order(arr, n, logn, stream);
+    cudaStreamSynchronize(stream); // block until the work queued on the stream has finished
+    return 0;
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int reverse_order_scalars_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+{ // C-ABI wrapper over reverse_order_batch(); returns 0 without synchronizing the stream. device_id is not read.
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2)); // truncated floating-point log2; NOTE(review): assumes n is 2^k
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    reverse_order_batch(arr, n, logn, batch_size, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int
+reverse_order_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
+{ // C-ABI wrapper over reverse_order() on points; returns 0 without synchronizing the stream. device_id is not read.
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2)); // truncated floating-point log2; NOTE(review): assumes n is 2^k
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    reverse_order(arr, n, logn, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+
+extern "C" int reverse_order_points_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
+{ // C-ABI wrapper over reverse_order_batch() on points; returns 0 without synchronizing. device_id is not read.
+  try {
+    uint32_t logn = uint32_t(log(n) / log(2)); // truncated floating-point log2; NOTE(review): assumes n is 2^k
+    cudaStreamCreate(&stream); // replaces the passed-in stream with a new one; NOTE(review): not destroyed here
+    reverse_order_batch(arr, n, logn, batch_size, stream);
+    return 0;
+  } catch (const std::runtime_error& ex) { // only std::runtime_error is handled
+    printf("error %s", ex.what());
+    return -1; // failure code, returned after the message is printed
+  }
+}
+#endif
--- a/icicle/curves/curve_template/msm.cu
+++ b/icicle/curves/curve_template/msm.cu
@@ -1,186 +0,0 @@
-#ifndef _${CURVE_NAME_U}_MSM
-#define _${CURVE_NAME_U}_MSM
-#include "../../appUtils/msm/msm.cu"
-#include <stdexcept>
-#include <cuda.h>
-#include "curve_config.cuh"
-
-
-extern "C"
-int msm_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *out, ${CURVE_NAME_U}::affine_t points[],
-              ${CURVE_NAME_U}::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int msm_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* out, ${CURVE_NAME_U}::affine_t points[],
-                              ${CURVE_NAME_U}::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-/**
- * Commit to a polynomial using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
- * @param d_out Ouptut point to write the result to.
- * @param d_scalars Scalars for the MSM. Must be on device.
- * @param d_points Points for the MSM. Must be on device.
- * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
- */
-extern "C"
-int commit_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::scalar_t* d_scalars, ${CURVE_NAME_U}::affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
- 
-/**
- * Commit to a batch of polynomials using the MSM.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
- * @param d_out Ouptut point to write the results to.
- * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
- * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
- * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
- * @param batch_size Size of the batch.
- */
-extern "C"
-int commit_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* d_out, ${CURVE_NAME_U}::scalar_t* d_scalars, ${CURVE_NAME_U}::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-#if defined(G2_DEFINED)
-extern "C"
-int msm_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *out, ${CURVE_NAME_U}::g2_affine_t points[],
-              ${CURVE_NAME_U}::scalar_t scalars[], size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {   
-        cudaStreamCreate(&stream);
-        large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::g2_projective_t, ${CURVE_NAME_U}::g2_affine_t>(scalars, points, count, out, false, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-extern "C" int msm_batch_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* out, ${CURVE_NAME_U}::g2_affine_t points[],
-                              ${CURVE_NAME_U}::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::g2_projective_t, ${CURVE_NAME_U}::g2_affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-
-/**
- * Commit to a polynomial using the MSM in G2 group.
- * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
- * @param d_out Ouptut G2 point to write the result to.
- * @param d_scalars Scalars for the MSM. Must be on device.
- * @param d_points G2 affine points for the MSM. Must be on device.
- * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
- */
-extern "C"
-int commit_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* d_out, ${CURVE_NAME_U}::scalar_t* d_scalars, ${CURVE_NAME_U}::g2_affine_t* d_points, size_t count, unsigned large_bucket_factor, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
- 
- /**
-  * Commit to a batch of polynomials using the MSM.
-  * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
-  * @param d_out Ouptut G2 point to write the results to.
-  * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
-  * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
-  * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`.
-  * @param batch_size Size of the batch.
-  */
-extern "C"
-int commit_batch_g2_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* d_out, ${CURVE_NAME_U}::scalar_t* d_scalars, ${CURVE_NAME_U}::g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
-{
-    // TODO: use device_id when working with multiple devices
-    (void)device_id;
-    try
-    {
-        cudaStreamCreate(&stream);
-        batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
-        cudaStreamSynchronize(stream);
-        return CUDA_SUCCESS;
-    }
-    catch (const std::runtime_error &ex)
-    {
-        printf("error %s", ex.what());
-        return -1;
-    }
-}
-#endif
-#endif
--- a/icicle/curves/curve_template/msm.cu.tmpl
+++ b/icicle/curves/curve_template/msm.cu.tmpl
@@ -0,0 +1,216 @@
+#ifndef _${CURVE_NAME_U}_MSM
+#define _${CURVE_NAME_U}_MSM
+#include "../../appUtils/msm/msm.cu"
+#include "curve_config.cuh"
+#include <cuda.h>
+#include <stdexcept>
+
+extern "C" int msm_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* out,
+  ${CURVE_NAME_U}::affine_t points[],
+  ${CURVE_NAME_U}::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+extern "C" int msm_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* out,
+  ${CURVE_NAME_U}::affine_t points[],
+  ${CURVE_NAME_U}::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+/**
+ * Commit to a polynomial using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Output point to write the result to.
+ * @param d_scalars Scalars for the MSM. Must be on device.
+ * @param d_points Points for the MSM. Must be on device.
+ * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
+ */
+extern "C" int commit_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_scalars,
+  ${CURVE_NAME_U}::affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+/**
+ * Commit to a batch of polynomials using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Output point to write the results to.
+ * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
+ * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
+ * @param count Length of `d_points` array; `d_scalars` has length `count` * `batch_size`.
+ * @param batch_size Size of the batch.
+ */
+extern "C" int commit_batch_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_scalars,
+  ${CURVE_NAME_U}::affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+#if defined(G2_DEFINED)
+extern "C" int msm_g2_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::g2_projective_t* out,
+  ${CURVE_NAME_U}::g2_affine_t points[],
+  ${CURVE_NAME_U}::scalar_t scalars[],
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::g2_projective_t, ${CURVE_NAME_U}::g2_affine_t>(
+      scalars, points, count, out, false, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+extern "C" int msm_batch_g2_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::g2_projective_t* out,
+  ${CURVE_NAME_U}::g2_affine_t points[],
+  ${CURVE_NAME_U}::scalar_t scalars[],
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::g2_projective_t, ${CURVE_NAME_U}::g2_affine_t>(
+      scalars, points, batch_size, msm_size, out, false, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+/**
+ * Commit to a polynomial using the MSM in G2 group.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Output G2 point to write the result to.
+ * @param d_scalars Scalars for the MSM. Must be on device.
+ * @param d_points G2 affine points for the MSM. Must be on device.
+ * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
+ */
+extern "C" int commit_g2_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::g2_projective_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_scalars,
+  ${CURVE_NAME_U}::g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    large_msm(d_scalars, d_points, count, d_out, true, false, large_bucket_factor, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+
+/**
+ * Commit to a batch of polynomials using the MSM.
+ * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or
+ * points.
+ * @param d_out Output G2 point to write the results to.
+ * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
+ * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for
+ * each MSM.
+ * @param count Length of `d_points` array; `d_scalars` has length `count` * `batch_size`.
+ * @param batch_size Size of the batch.
+ */
+extern "C" int commit_batch_g2_cuda_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::g2_projective_t* d_out,
+  ${CURVE_NAME_U}::scalar_t* d_scalars,
+  ${CURVE_NAME_U}::g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id = 0,
+  cudaStream_t stream = 0)
+{
+  // TODO: use device_id when working with multiple devices
+  (void)device_id;
+  try {
+    cudaStreamCreate(&stream);
+    batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
+    cudaStreamSynchronize(stream);
+    return CUDA_SUCCESS;
+  } catch (const std::runtime_error& ex) {
+    printf("error %s", ex.what());
+    return -1;
+  }
+}
+#endif
+#endif
--- a/icicle/curves/curve_template/params.cuh.tmpl
+++ b/icicle/curves/curve_template/params.cuh.tmpl
@@ -6,7 +6,7 @@ namespace PARAMS_${curve_name_U} {
    static constexpr unsigned limbs_count = ${fp_num_limbs};
    static constexpr unsigned omegas_count = ${num_omegas};
    static constexpr unsigned modulus_bit_count = ${fp_modulus_bit_count};
-    
+
    static constexpr storage<limbs_count> modulus = {${fp_modulus}};
    static constexpr storage<limbs_count> modulus_2 = {${fp_modulus_2}};
    static constexpr storage<limbs_count> modulus_4 = {${fp_modulus_4}};
--- a/icicle/curves/curve_template/projective.cu
+++ b/icicle/curves/curve_template/projective.cu
@@ -1,70 +0,0 @@
-#include <cuda.h>
-#include "curve_config.cuh"
-#include "../../primitives/projective.cuh"
-
-extern "C" ${CURVE_NAME_U}::projective_t random_projective_${CURVE_NAME_L}()
-{
-  return ${CURVE_NAME_U}::projective_t::rand_host();
-}
-
-extern "C" ${CURVE_NAME_U}::projective_t projective_zero_${CURVE_NAME_L}()
-{
-  return ${CURVE_NAME_U}::projective_t::zero();
-}
-
-extern "C" bool projective_is_on_curve_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *point1)
-{
-  return ${CURVE_NAME_U}::projective_t::is_on_curve(*point1);
-}
-
-extern "C" ${CURVE_NAME_U}::affine_t projective_to_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *point1)
-{
-  return ${CURVE_NAME_U}::projective_t::to_affine(*point1);
-}
-
-extern "C" ${CURVE_NAME_U}::projective_t projective_from_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::affine_t *point1)
-{
-  return ${CURVE_NAME_U}::projective_t::from_affine(*point1);
-}
-
-extern "C" ${CURVE_NAME_U}::scalar_field_t random_scalar_${CURVE_NAME_L}()
-{
-  return ${CURVE_NAME_U}::scalar_field_t::rand_host();
-}
-
-extern "C" bool eq_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *point1, ${CURVE_NAME_U}::projective_t *point2)
-{
-  return (*point1 == *point2) && 
-  !((point1->x == ${CURVE_NAME_U}::point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::point_field_t::zero()) && (point1->z == ${CURVE_NAME_U}::point_field_t::zero())) && 
-  !((point2->x == ${CURVE_NAME_U}::point_field_t::zero()) && (point2->y == ${CURVE_NAME_U}::point_field_t::zero()) && (point2->z == ${CURVE_NAME_U}::point_field_t::zero()));
-}
-
-#if defined(G2_DEFINED)
-extern "C" bool eq_g2_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1, ${CURVE_NAME_U}::g2_projective_t *point2)
-{
-  return (*point1 == *point2) && 
-  !((point1->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->z == ${CURVE_NAME_U}::g2_point_field_t::zero())) && 
-  !((point2->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->z == ${CURVE_NAME_U}::g2_point_field_t::zero()));
-}
-
-extern "C" ${CURVE_NAME_U}::g2_projective_t random_g2_projective_${CURVE_NAME_L}()
-{
-  return ${CURVE_NAME_U}::g2_projective_t::rand_host();
-}
-
-extern "C" ${CURVE_NAME_U}::g2_affine_t g2_projective_to_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1)
-{
-  return ${CURVE_NAME_U}::g2_projective_t::to_affine(*point1);
-}
-
-extern "C" ${CURVE_NAME_U}::g2_projective_t g2_projective_from_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_affine_t *point1)
-{
-  return ${CURVE_NAME_U}::g2_projective_t::from_affine(*point1);
-}
-
-extern "C" bool g2_projective_is_on_curve_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1)
-{
-  return ${CURVE_NAME_U}::g2_projective_t::is_on_curve(*point1);
-}
-
-#endif
--- a/icicle/curves/curve_template/projective.cu.tmpl
+++ b/icicle/curves/curve_template/projective.cu.tmpl
@@ -0,0 +1,62 @@
+#include "../../primitives/projective.cuh"
+#include "curve_config.cuh"
+#include <cuda.h>
+
+extern "C" ${CURVE_NAME_U}::projective_t random_projective_${CURVE_NAME_L}() { return ${CURVE_NAME_U}::projective_t::rand_host(); }
+
+extern "C" ${CURVE_NAME_U}::projective_t projective_zero_${CURVE_NAME_L}() { return ${CURVE_NAME_U}::projective_t::zero(); }
+
+extern "C" bool projective_is_on_curve_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* point1)
+{
+  return ${CURVE_NAME_U}::projective_t::is_on_curve(*point1);
+}
+
+extern "C" ${CURVE_NAME_U}::affine_t projective_to_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* point1)
+{
+  return ${CURVE_NAME_U}::projective_t::to_affine(*point1);
+}
+
+extern "C" ${CURVE_NAME_U}::projective_t projective_from_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::affine_t* point1)
+{
+  return ${CURVE_NAME_U}::projective_t::from_affine(*point1);
+}
+
+extern "C" ${CURVE_NAME_U}::scalar_field_t random_scalar_${CURVE_NAME_L}() { return ${CURVE_NAME_U}::scalar_field_t::rand_host(); }
+
+extern "C" bool eq_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* point1, ${CURVE_NAME_U}::projective_t* point2)
+{
+  return (*point1 == *point2) &&
+         !((point1->x == ${CURVE_NAME_U}::point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::point_field_t::zero()) &&
+           (point1->z == ${CURVE_NAME_U}::point_field_t::zero())) &&
+         !((point2->x == ${CURVE_NAME_U}::point_field_t::zero()) && (point2->y == ${CURVE_NAME_U}::point_field_t::zero()) &&
+           (point2->z == ${CURVE_NAME_U}::point_field_t::zero()));
+}
+
+#if defined(G2_DEFINED)
+extern "C" bool eq_g2_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* point1, ${CURVE_NAME_U}::g2_projective_t* point2)
+{
+  return (*point1 == *point2) &&
+         !((point1->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) &&
+           (point1->z == ${CURVE_NAME_U}::g2_point_field_t::zero())) &&
+         !((point2->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) &&
+           (point2->z == ${CURVE_NAME_U}::g2_point_field_t::zero()));
+}
+
+extern "C" ${CURVE_NAME_U}::g2_projective_t random_g2_projective_${CURVE_NAME_L}() { return ${CURVE_NAME_U}::g2_projective_t::rand_host(); }
+
+extern "C" ${CURVE_NAME_U}::g2_affine_t g2_projective_to_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* point1)
+{
+  return ${CURVE_NAME_U}::g2_projective_t::to_affine(*point1);
+}
+
+extern "C" ${CURVE_NAME_U}::g2_projective_t g2_projective_from_affine_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_affine_t* point1)
+{
+  return ${CURVE_NAME_U}::g2_projective_t::from_affine(*point1);
+}
+
+extern "C" bool g2_projective_is_on_curve_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t* point1)
+{
+  return ${CURVE_NAME_U}::g2_projective_t::is_on_curve(*point1);
+}
+
+#endif
--- a/icicle/curves/curve_template/supported_operations.cu.tmpl
+++ b/icicle/curves/curve_template/supported_operations.cu.tmpl
@@ -1,4 +1,4 @@
-#include "projective.cu"
 #include "lde.cu"
 #include "msm.cu"
+#include "projective.cu"
 #include "ve_mod_mult.cu"
--- a/icicle/curves/curve_template/ve_mod_mult.cu.tmpl
+++ b/icicle/curves/curve_template/ve_mod_mult.cu.tmpl
@@ -1,88 +1,70 @@
 #ifndef _${CURVE_NAME_U}_VEC_MULT
 #define _${CURVE_NAME_U}_VEC_MULT
-#include <stdio.h>
-#include <iostream>
-#include "../../primitives/field.cuh"
-#include "../../utils/storage.cuh"
-#include "../../primitives/projective.cuh"
-#include "curve_config.cuh"
 #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/storage.cuh"
+#include "curve_config.cuh"
+#include <iostream>
+#include <stdio.h>

-
-extern "C" int32_t vec_mod_mult_point_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *inout,
-                                      ${CURVE_NAME_U}::scalar_t *scalar_vec,
-                                      size_t n_elments,
-                                      size_t device_id,
-                                      cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_point_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::projective_t* inout, ${CURVE_NAME_U}::scalar_t* scalar_vec, size_t n_elments, size_t device_id, cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t vec_mod_mult_scalar_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *inout,
-                                       ${CURVE_NAME_U}::scalar_t *scalar_vec,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t vec_mod_mult_scalar_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* inout, ${CURVE_NAME_U}::scalar_t* scalar_vec, size_t n_elments, size_t device_id, cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    vector_mod_mult<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

 extern "C" int32_t vec_mod_mult_device_scalar_${CURVE_NAME_L}(
-    ${CURVE_NAME_U}::scalar_t *inout,
-    ${CURVE_NAME_U}::scalar_t *scalar_vec,
-    size_t n_elements,
-    size_t device_id
-) {
+  ${CURVE_NAME_U}::scalar_t* inout, ${CURVE_NAME_U}::scalar_t* scalar_vec, size_t n_elements, size_t device_id)
+{
  try {
    vector_mod_mult_device<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::scalar_t>(scalar_vec, inout, inout, n_elements);
    return CUDA_SUCCESS;
-  } catch (const std::runtime_error &ex) {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
 }

-extern "C" int32_t matrix_vec_mod_mult_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *matrix_flattened,
-                                       ${CURVE_NAME_U}::scalar_t *input,
-                                       ${CURVE_NAME_U}::scalar_t *output,
-                                       size_t n_elments,
-                                       size_t device_id,
-                                       cudaStream_t stream = 0)
+extern "C" int32_t matrix_vec_mod_mult_${CURVE_NAME_L}(
+  ${CURVE_NAME_U}::scalar_t* matrix_flattened,
+  ${CURVE_NAME_U}::scalar_t* input,
+  ${CURVE_NAME_U}::scalar_t* output,
+  size_t n_elments,
+  size_t device_id,
+  cudaStream_t stream = 0)
 {
  // TODO: use device_id when working with multiple devices
  (void)device_id;
-  try
-  {
+  try {
    // TODO: device_id
    matrix_mod_mult<${CURVE_NAME_U}::scalar_t>(matrix_flattened, input, output, n_elments, stream);
    return CUDA_SUCCESS;
-  }
-  catch (const std::runtime_error &ex)
-  {
+  } catch (const std::runtime_error& ex) {
    printf("error %s", ex.what()); // TODO: error code and message
    return -1;
  }
--- a/icicle/curves/index.cu
+++ b/icicle/curves/index.cu
@@ -1,3 +1,3 @@
-#include "bls12_381/supported_operations.cu"
 #include "bls12_377/supported_operations.cu"
+#include "bls12_381/supported_operations.cu"
 #include "bn254/supported_operations.cu"
--- a/icicle/primitives/affine.cuh
+++ b/icicle/primitives/affine.cuh
@@ -3,21 +3,22 @@
 #include "field.cuh"

 template <class FF>
-class Affine {  
-  public:
-    FF x;
-    FF y;
+class Affine
+{
+public:
+  FF x;
+  FF y;

-    static HOST_DEVICE_INLINE Affine neg(const Affine &point) { 
-      return {point.x, FF::neg(point.y)}; 
-    }
+  static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }

-    friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys) {
-      return (xs.x == ys.x) && (xs.y == ys.y);
-    }
+  friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
+  {
+    return (xs.x == ys.x) && (xs.y == ys.y);
+  }

-    friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point) {
-      os << "x: " << point.x << "; y: " << point.y;
-      return os;
-    }
+  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
+  {
+    os << "x: " << point.x << "; y: " << point.y;
+    return os;
+  }
 };
--- a/icicle/primitives/extension_field.cuh
+++ b/icicle/primitives/extension_field.cuh
@@ -2,143 +2,157 @@

 #include "field.cuh"

-#define HOST_INLINE __host__ __forceinline__
-#define DEVICE_INLINE __device__ __forceinline__
+#define HOST_INLINE        __host__ __forceinline__
+#define DEVICE_INLINE      __device__ __forceinline__
 #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__

-template <typename CONFIG> class ExtensionField {
-  private:
-    typedef typename Field<CONFIG>::Wide FWide;
+template <typename CONFIG>
+class ExtensionField
+{
+private:
+  typedef typename Field<CONFIG>::Wide FWide;

-    struct ExtensionWide {
-      FWide real;
-      FWide imaginary;
-  
-      friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys) {   
-        return ExtensionWide { xs.real + ys.real, xs.imaginary + ys.imaginary };
-      }
-  
-      friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys) {   
-        return ExtensionWide { xs.real - ys.real, xs.imaginary - ys.imaginary };
-      }
-    };
+  struct ExtensionWide {
+    FWide real;
+    FWide imaginary;

-  public:
-    typedef Field<CONFIG> FF;
-    static constexpr unsigned TLC = 2 * CONFIG::limbs_count;
-
-    FF real;
-    FF imaginary;
-
-    static constexpr HOST_DEVICE_INLINE ExtensionField zero() {
-      return ExtensionField { FF::zero(), FF::zero() };
+    friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
    }

-    static constexpr HOST_DEVICE_INLINE ExtensionField one() {
-      return ExtensionField { FF::one(), FF::zero() };
+    friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
    }
+  };

-    static constexpr HOST_DEVICE_INLINE ExtensionField generator_x() {
-      return ExtensionField { FF { CONFIG::g2_gen_x_re }, FF { CONFIG::g2_gen_x_im } };
-    }
+public:
+  typedef Field<CONFIG> FF;
+  static constexpr unsigned TLC = 2 * CONFIG::limbs_count;

-    static constexpr HOST_DEVICE_INLINE ExtensionField generator_y() {
-      return ExtensionField { FF { CONFIG::g2_gen_y_re }, FF { CONFIG::g2_gen_y_im } };
-    }
+  FF real;
+  FF imaginary;

-    static HOST_INLINE ExtensionField rand_host() {
-      return ExtensionField { FF::rand_host(), FF::rand_host() };
-    }
+  static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }

-    template <unsigned REDUCTION_SIZE = 1> static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField &xs) {
-      return ExtensionField { FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.imaginary) };
-    }
+  static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }

-    friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs) {
-      os << "{ Real: " << xs.real << " }; { Imaginary: " << xs.imaginary << " }";
-      return os;
-    }
+  static constexpr HOST_DEVICE_INLINE ExtensionField generator_x()
+  {
+    return ExtensionField{FF{CONFIG::g2_gen_x_re}, FF{CONFIG::g2_gen_x_im}};
+  }

-    friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys) {
-      return ExtensionField { xs.real + ys.real, xs.imaginary + ys.imaginary };
-    }
+  static constexpr HOST_DEVICE_INLINE ExtensionField generator_y()
+  {
+    return ExtensionField{FF{CONFIG::g2_gen_y_re}, FF{CONFIG::g2_gen_y_im}};
+  }

-    friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys) {
-      return ExtensionField { xs.real - ys.real, xs.imaginary - ys.imaginary };
-    }
+  static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }

-    template <unsigned MODULUS_MULTIPLE = 1>
-    static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys) {
-      FWide real_prod = FF::mul_wide(xs.real, ys.real);
-      FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
-      FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary);
-      FWide i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
-      i_sq_times_im = CONFIG::i_squared_is_negative ? FWide::neg(i_sq_times_im) : i_sq_times_im;
-      return ExtensionWide { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod };
-    }
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
+  {
+    return ExtensionField{FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.imaginary)};
+  }

-    template <unsigned MODULUS_MULTIPLE = 1>
-    static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs) {
-      return ExtensionField { FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary) };
-    }
+  friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs)
+  {
+    os << "{ Real: " << xs.real << " }; { Imaginary: " << xs.imaginary << " }";
+    return os;
+  }

-    friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys) {
-      ExtensionWide xy = mul_wide(xs, ys);
-      return reduce(xy);
-    }
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
+  }

-    friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys) {
-      return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
-    }
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
+  }

-    friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) {
-      return !(xs == ys);
-    }
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    FWide real_prod = FF::mul_wide(xs.real, ys.real);
+    FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
+    FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary);
+    FWide i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FWide::neg(i_sq_times_im) : i_sq_times_im;
+    return ExtensionWide{real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod};
+  }

-    template <const ExtensionField& multiplier>
-    static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField &xs) {
-      static constexpr FF mul_real = multiplier.real;
-      static constexpr FF mul_imaginary = multiplier.imaginary;
-      const FF xs_real = xs.real;
-      const FF xs_imaginary = xs.imaginary;
-      FF real_prod = FF::template mul_const<mul_real>(xs_real);
-      FF imaginary_prod = FF::template mul_const<mul_imaginary>(xs_imaginary);
-      FF re_im = FF::template mul_const<mul_real>(xs_imaginary);
-      FF im_re = FF::template mul_const<mul_imaginary>(xs_real);
-      FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
-      i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
-      return ExtensionField { real_prod + i_sq_times_im, re_im + im_re };
-    }
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
+  {
+    return ExtensionField{
+      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
+  }

-    template <uint32_t mutliplier, unsigned REDUCTION_SIZE = 1>
-    static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField &xs) {
-      return { FF::template mul_unsigned<mutliplier>(xs.real), FF::template mul_unsigned<mutliplier>(xs.imaginary) };
-    }
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    ExtensionWide xy = mul_wide(xs, ys);
+    return reduce(xy);
+  }

-    template <unsigned MODULUS_MULTIPLE = 1>
-    static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs) {
-      // TODO: change to a more efficient squaring
-      return mul_wide<MODULUS_MULTIPLE>(xs, xs);
-    }
+  friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
+  }

-    template <unsigned MODULUS_MULTIPLE = 1>
-    static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs) {
-      // TODO: change to a more efficient squaring
-      return xs * xs;
-    }
+  friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }

-    template <unsigned MODULUS_MULTIPLE = 1>
-    static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs) {
-      return ExtensionField { FF::neg(xs.real), FF::neg(xs.imaginary) };
-    }
+  template <const ExtensionField& multiplier>
+  static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
+  {
+    static constexpr FF mul_real = multiplier.real;
+    static constexpr FF mul_imaginary = multiplier.imaginary;
+    const FF xs_real = xs.real;
+    const FF xs_imaginary = xs.imaginary;
+    FF real_prod = FF::template mul_const<mul_real>(xs_real);
+    FF imaginary_prod = FF::template mul_const<mul_imaginary>(xs_imaginary);
+    FF re_im = FF::template mul_const<mul_real>(xs_imaginary);
+    FF im_re = FF::template mul_const<mul_imaginary>(xs_real);
+    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    return ExtensionField{real_prod + i_sq_times_im, re_im + im_re};
+  }

-    // inverse assumes that xs is nonzero
-    static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs) {
-      ExtensionField xs_conjugate = { xs.real, FF::neg(xs.imaginary) };
-      FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(FF::sqr(xs.imaginary));
-      i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
-      // TODO: wide here
-      FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
-      return xs_conjugate * ExtensionField { FF::inverse(xs_norm_squared), FF::zero() };
-    }
+  template <uint32_t mutliplier, unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
+  {
+    return {FF::template mul_unsigned<mutliplier>(xs.real), FF::template mul_unsigned<mutliplier>(xs.imaginary)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide<MODULUS_MULTIPLE>(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
+  {
+    return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
+  }
+
+  // inverse assumes that xs is nonzero
+  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
+  {
+    ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
+    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(FF::sqr(xs.imaginary));
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    // TODO: wide here
+    FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
+    return xs_conjugate * ExtensionField{FF::inverse(xs_norm_squared), FF::zero()};
+  }
 };
--- a/icicle/primitives/field.cuh
+++ b/icicle/primitives/field.cuh
--- a/icicle/primitives/projective.cu
+++ b/icicle/primitives/projective.cu
@@ -1,49 +1,61 @@
-#include <cuda.h>
-#include "../curves/bls12_381/curve_config.cuh"
 #include "../curves/bls12_377/curve_config.cuh"
+#include "../curves/bls12_381/curve_config.cuh"
 #include "../curves/bn254/curve_config.cuh"
 #include "projective.cuh"
+#include <cuda.h>

-extern "C" bool eq_bls12_381(BLS12_381::projective_t *point1, BLS12_381::projective_t *point2)
+extern "C" bool eq_bls12_381(BLS12_381::projective_t* point1, BLS12_381::projective_t* point2)
 {
-    return (*point1 == *point2) && 
-    !((point1->x == BLS12_381::point_field_t::zero()) && (point1->y == BLS12_381::point_field_t::zero()) && (point1->z == BLS12_381::point_field_t::zero())) && 
-    !((point2->x == BLS12_381::point_field_t::zero()) && (point2->y == BLS12_381::point_field_t::zero()) && (point2->z == BLS12_381::point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_381::point_field_t::zero()) && (point1->y == BLS12_381::point_field_t::zero()) &&
+           (point1->z == BLS12_381::point_field_t::zero())) &&
+         !((point2->x == BLS12_381::point_field_t::zero()) && (point2->y == BLS12_381::point_field_t::zero()) &&
+           (point2->z == BLS12_381::point_field_t::zero()));
 }

-extern "C" bool eq_bls12_377(BLS12_377::projective_t *point1, BLS12_377::projective_t *point2)
+extern "C" bool eq_bls12_377(BLS12_377::projective_t* point1, BLS12_377::projective_t* point2)
 {
-    return (*point1 == *point2) && 
-    !((point1->x == BLS12_377::point_field_t::zero()) && (point1->y == BLS12_377::point_field_t::zero()) && (point1->z == BLS12_377::point_field_t::zero())) && 
-    !((point2->x == BLS12_377::point_field_t::zero()) && (point2->y == BLS12_377::point_field_t::zero()) && (point2->z == BLS12_377::point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_377::point_field_t::zero()) && (point1->y == BLS12_377::point_field_t::zero()) &&
+           (point1->z == BLS12_377::point_field_t::zero())) &&
+         !((point2->x == BLS12_377::point_field_t::zero()) && (point2->y == BLS12_377::point_field_t::zero()) &&
+           (point2->z == BLS12_377::point_field_t::zero()));
 }

-extern "C" bool eq_bn254(BN254::projective_t *point1, BN254::projective_t *point2)
+extern "C" bool eq_bn254(BN254::projective_t* point1, BN254::projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BN254::point_field_t::zero()) && (point1->y == BN254::point_field_t::zero()) && (point1->z == BN254::point_field_t::zero())) && 
-  !((point2->x == BN254::point_field_t::zero()) && (point2->y == BN254::point_field_t::zero()) && (point2->z == BN254::point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BN254::point_field_t::zero()) && (point1->y == BN254::point_field_t::zero()) &&
+           (point1->z == BN254::point_field_t::zero())) &&
+         !((point2->x == BN254::point_field_t::zero()) && (point2->y == BN254::point_field_t::zero()) &&
+           (point2->z == BN254::point_field_t::zero()));
 }

 #if defined(G2_DEFINED)
-extern "C" bool eq_g2_bls12_381(BLS12_381::g2_projective_t *point1, BLS12_381::g2_projective_t *point2)
+extern "C" bool eq_g2_bls12_381(BLS12_381::g2_projective_t* point1, BLS12_381::g2_projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BLS12_381::g2_point_field_t::zero()) && (point1->y == BLS12_381::g2_point_field_t::zero()) && (point1->z == BLS12_381::g2_point_field_t::zero())) && 
-  !((point2->x == BLS12_381::g2_point_field_t::zero()) && (point2->y == BLS12_381::g2_point_field_t::zero()) && (point2->z == BLS12_381::g2_point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_381::g2_point_field_t::zero()) && (point1->y == BLS12_381::g2_point_field_t::zero()) &&
+           (point1->z == BLS12_381::g2_point_field_t::zero())) &&
+         !((point2->x == BLS12_381::g2_point_field_t::zero()) && (point2->y == BLS12_381::g2_point_field_t::zero()) &&
+           (point2->z == BLS12_381::g2_point_field_t::zero()));
 }

-extern "C" bool eq_g2_bls12_377(BLS12_377::g2_projective_t *point1, BLS12_377::g2_projective_t *point2)
+extern "C" bool eq_g2_bls12_377(BLS12_377::g2_projective_t* point1, BLS12_377::g2_projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BLS12_377::g2_point_field_t::zero()) && (point1->y == BLS12_377::g2_point_field_t::zero()) && (point1->z == BLS12_377::g2_point_field_t::zero())) && 
-  !((point2->x == BLS12_377::g2_point_field_t::zero()) && (point2->y == BLS12_377::g2_point_field_t::zero()) && (point2->z == BLS12_377::g2_point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BLS12_377::g2_point_field_t::zero()) && (point1->y == BLS12_377::g2_point_field_t::zero()) &&
+           (point1->z == BLS12_377::g2_point_field_t::zero())) &&
+         !((point2->x == BLS12_377::g2_point_field_t::zero()) && (point2->y == BLS12_377::g2_point_field_t::zero()) &&
+           (point2->z == BLS12_377::g2_point_field_t::zero()));
 }

-extern "C" bool eq_g2_bn254(BN254::g2_projective_t *point1, BN254::g2_projective_t *point2)
+extern "C" bool eq_g2_bn254(BN254::g2_projective_t* point1, BN254::g2_projective_t* point2)
 {
-  return (*point1 == *point2) && 
-  !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) && (point1->z == BN254::g2_point_field_t::zero())) && 
-  !((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) && (point2->z == BN254::g2_point_field_t::zero()));
+  return (*point1 == *point2) &&
+         !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) &&
+           (point1->z == BN254::g2_point_field_t::zero())) &&
+         !((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) &&
+           (point2->z == BN254::g2_point_field_t::zero()));
 }
 #endif
--- a/icicle/primitives/projective.cuh
+++ b/icicle/primitives/projective.cuh
@@ -3,170 +3,164 @@
 #include "affine.cuh"

 template <typename FF, class SCALAR_FF, const FF& B_VALUE>
-class Projective {
+class Projective
+{
  friend Affine<FF>;

-  public:
-    FF x;
-    FF y;
-    FF z;
+public:
+  FF x;
+  FF y;
+  FF z;

-    static HOST_DEVICE_INLINE Projective zero() {
-      return {FF::zero(), FF::one(), FF::zero()};
-    }
+  static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }

-    static HOST_DEVICE_INLINE Affine<FF> to_affine(const Projective &point) {
-      FF denom = FF::inverse(point.z);
-      return {point.x * denom, point.y * denom};
-    }
+  static HOST_DEVICE_INLINE Affine<FF> to_affine(const Projective& point)
+  {
+    FF denom = FF::inverse(point.z);
+    return {point.x * denom, point.y * denom};
+  }

-    static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF> &point) {
-      return {point.x, point.y, FF::one()};
-    }
+  static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point) { return {point.x, point.y, FF::one()}; }

-    static HOST_DEVICE_INLINE Projective generator() {
-      return {FF::generator_x(), FF::generator_y(), FF::one()};
-    }
+  static HOST_DEVICE_INLINE Projective generator() { return {FF::generator_x(), FF::generator_y(), FF::one()}; }

-    static HOST_DEVICE_INLINE Projective neg(const Projective &point) { 
-      return {point.x, FF::neg(point.y), point.z};
-    }
+  static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }

-    friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2) {   
-      const FF X1 = p1.x;                                      //                   < 2
-      const FF Y1 = p1.y;                                      //                   < 2
-      const FF Z1 = p1.z;                                      //                   < 2
-      const FF X2 = p2.x;                                      //                   < 2
-      const FF Y2 = p2.y;                                      //                   < 2
-      const FF Z2 = p2.z;                                      //                   < 2
-      const FF t00 = X1 * X2;                                  // t00 ← X1 · X2     < 2
-      const FF t01 = Y1 * Y2;                                  // t01 ← Y1 · Y2     < 2
-      const FF t02 = Z1 * Z2;                                  // t02 ← Z1 · Z2     < 2
-      const FF t03 = X1 + Y1;                                  // t03 ← X1 + Y1     < 4
-      const FF t04 = X2 + Y2;                                  // t04 ← X2 + Y2     < 4
-      const FF t05 = t03 * t04;                                // t03 ← t03 · t04   < 3
-      const FF t06 = t00 + t01;                                // t06 ← t00 + t01   < 4
-      const FF t07 = t05 - t06;                                // t05 ← t05 − t06   < 2
-      const FF t08 = Y1 + Z1;                                  // t08 ← Y1 + Z1     < 4
-      const FF t09 = Y2 + Z2;                                  // t09 ← Y2 + Z2     < 4
-      const FF t10 = t08 * t09;                                // t10 ← t08 · t09   < 3
-      const FF t11 = t01 + t02;                                // t11 ← t01 + t02   < 4
-      const FF t12 = t10 - t11;                                // t12 ← t10 − t11   < 2
-      const FF t13 = X1 + Z1;                                  // t13 ← X1 + Z1     < 4
-      const FF t14 = X2 + Z2;                                  // t14 ← X2 + Z2     < 4
-      const FF t15 = t13 * t14;                                // t15 ← t13 · t14   < 3
-      const FF t16 = t00 + t02;                                // t16 ← t00 + t02   < 4
-      const FF t17 = t15 - t16;                                // t17 ← t15 − t16   < 2
-      const FF t18 = t00 + t00;                                // t18 ← t00 + t00   < 2
-      const FF t19 = t18 + t00;                                // t19 ← t18 + t00   < 2
-      const FF t20 = FF::template mul_unsigned<3>(
-        FF::template mul_const<B_VALUE>(t02));                 // t20 ← b3 · t02    < 2
-      const FF t21 = t01 + t20;                                // t21 ← t01 + t20   < 2
-      const FF t22 = t01 - t20;                                // t22 ← t01 − t20   < 2
-      const FF t23 = FF::template mul_unsigned<3>(
-        FF::template mul_const<B_VALUE>(t17));                 // t23 ← b3 · t17    < 2
-      const auto t24 = FF::mul_wide(t12, t23);                 // t24 ← t12 · t23   < 2
-      const auto t25 = FF::mul_wide(t07, t22);                 // t25 ← t07 · t22   < 2
-      const FF X3 = FF::reduce(t25 - t24);                     // X3 ← t25 − t24    < 2
-      const auto t27 = FF::mul_wide(t23, t19);                 // t27 ← t23 · t19   < 2
-      const auto t28 = FF::mul_wide(t22, t21);                 // t28 ← t22 · t21   < 2
-      const FF Y3 = FF::reduce(t28 + t27);                     // Y3 ← t28 + t27    < 2
-      const auto t30 = FF::mul_wide(t19, t07);                 // t30 ← t19 · t07   < 2
-      const auto t31 = FF::mul_wide(t21, t12);                 // t31 ← t21 · t12   < 2
-      const FF Z3 = FF::reduce(t31 + t30);                     // Z3 ← t31 + t30    < 2
-      return {X3, Y3, Z3};
-    }
+  friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
+  {
+    const FF X1 = p1.x;                                                                //                   < 2
+    const FF Y1 = p1.y;                                                                //                   < 2
+    const FF Z1 = p1.z;                                                                //                   < 2
+    const FF X2 = p2.x;                                                                //                   < 2
+    const FF Y2 = p2.y;                                                                //                   < 2
+    const FF Z2 = p2.z;                                                                //                   < 2
+    const FF t00 = X1 * X2;                                                            // t00 ← X1 · X2     < 2
+    const FF t01 = Y1 * Y2;                                                            // t01 ← Y1 · Y2     < 2
+    const FF t02 = Z1 * Z2;                                                            // t02 ← Z1 · Z2     < 2
+    const FF t03 = X1 + Y1;                                                            // t03 ← X1 + Y1     < 4
+    const FF t04 = X2 + Y2;                                                            // t04 ← X2 + Y2     < 4
+    const FF t05 = t03 * t04;                                                          // t05 ← t03 · t04   < 3
+    const FF t06 = t00 + t01;                                                          // t06 ← t00 + t01   < 4
+    const FF t07 = t05 - t06;                                                          // t07 ← t05 − t06   < 2
+    const FF t08 = Y1 + Z1;                                                            // t08 ← Y1 + Z1     < 4
+    const FF t09 = Y2 + Z2;                                                            // t09 ← Y2 + Z2     < 4
+    const FF t10 = t08 * t09;                                                          // t10 ← t08 · t09   < 3
+    const FF t11 = t01 + t02;                                                          // t11 ← t01 + t02   < 4
+    const FF t12 = t10 - t11;                                                          // t12 ← t10 − t11   < 2
+    const FF t13 = X1 + Z1;                                                            // t13 ← X1 + Z1     < 4
+    const FF t14 = X2 + Z2;                                                            // t14 ← X2 + Z2     < 4
+    const FF t15 = t13 * t14;                                                          // t15 ← t13 · t14   < 3
+    const FF t16 = t00 + t02;                                                          // t16 ← t00 + t02   < 4
+    const FF t17 = t15 - t16;                                                          // t17 ← t15 − t16   < 2
+    const FF t18 = t00 + t00;                                                          // t18 ← t00 + t00   < 2
+    const FF t19 = t18 + t00;                                                          // t19 ← t18 + t00   < 2
+    const FF t20 = FF::template mul_unsigned<3>(FF::template mul_const<B_VALUE>(t02)); // t20 ← b3 · t02    < 2
+    const FF t21 = t01 + t20;                                                          // t21 ← t01 + t20   < 2
+    const FF t22 = t01 - t20;                                                          // t22 ← t01 − t20   < 2
+    const FF t23 = FF::template mul_unsigned<3>(FF::template mul_const<B_VALUE>(t17)); // t23 ← b3 · t17    < 2
+    const auto t24 = FF::mul_wide(t12, t23);                                           // t24 ← t12 · t23   < 2
+    const auto t25 = FF::mul_wide(t07, t22);                                           // t25 ← t07 · t22   < 2
+    const FF X3 = FF::reduce(t25 - t24);                                               // X3 ← t25 − t24    < 2
+    const auto t27 = FF::mul_wide(t23, t19);                                           // t27 ← t23 · t19   < 2
+    const auto t28 = FF::mul_wide(t22, t21);                                           // t28 ← t22 · t21   < 2
+    const FF Y3 = FF::reduce(t28 + t27);                                               // Y3 ← t28 + t27    < 2
+    const auto t30 = FF::mul_wide(t19, t07);                                           // t30 ← t19 · t07   < 2
+    const auto t31 = FF::mul_wide(t21, t12);                                           // t31 ← t21 · t12   < 2
+    const FF Z3 = FF::reduce(t31 + t30);                                               // Z3 ← t31 + t30    < 2
+    return {X3, Y3, Z3};
+  }

-    friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) {   
-      return p1 + neg(p2);
-    }
+  friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }

-    friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2) {   
-      const FF X1 = p1.x;                                      //                   < 2
-      const FF Y1 = p1.y;                                      //                   < 2
-      const FF Z1 = p1.z;                                      //                   < 2
-      const FF X2 = p2.x;                                      //                   < 2
-      const FF Y2 = p2.y;                                      //                   < 2
-      const FF t00 = X1 * X2;                                  // t00 ← X1 · X2     < 2
-      const FF t01 = Y1 * Y2;                                  // t01 ← Y1 · Y2     < 2
-      const FF t02 = Z1;                                       // t02 ← Z1          < 2
-      const FF t03 = X1 + Y1;                                  // t03 ← X1 + Y1     < 4
-      const FF t04 = X2 + Y2;                                  // t04 ← X2 + Y2     < 4
-      const FF t05 = t03 * t04;                                // t03 ← t03 · t04   < 3
-      const FF t06 = t00 + t01;                                // t06 ← t00 + t01   < 4
-      const FF t07 = t05 - t06;                                // t05 ← t05 − t06   < 2
-      const FF t08 = Y1 + Z1;                                  // t08 ← Y1 + Z1     < 4
-      const FF t09 = Y2 + FF::one();                           // t09 ← Y2 + 1      < 4
-      const FF t10 = t08 * t09;                                // t10 ← t08 · t09   < 3
-      const FF t11 = t01 + t02;                                // t11 ← t01 + t02   < 4
-      const FF t12 = t10 - t11;                                // t12 ← t10 − t11   < 2
-      const FF t13 = X1 + Z1;                                  // t13 ← X1 + Z1     < 4
-      const FF t14 = X2 + FF::one();                           // t14 ← X2 + 1      < 4
-      const FF t15 = t13 * t14;                                // t15 ← t13 · t14   < 3
-      const FF t16 = t00 + t02;                                // t16 ← t00 + t02   < 4
-      const FF t17 = t15 - t16;                                // t17 ← t15 − t16   < 2
-      const FF t18 = t00 + t00;                                // t18 ← t00 + t00   < 2
-      const FF t19 = t18 + t00;                                // t19 ← t18 + t00   < 2
-      const FF t20 = FF::template mul_unsigned<3>(
-        FF::template mul_const<B_VALUE>(t02));                 // t20 ← b3 · t02    < 2
-      const FF t21 = t01 + t20;                                // t21 ← t01 + t20   < 2
-      const FF t22 = t01 - t20;                                // t22 ← t01 − t20   < 2
-      const FF t23 = FF::template mul_unsigned<3>(
-        FF::template mul_const<B_VALUE>(t17));                 // t23 ← b3 · t17    < 2
-      const auto t24 = FF::mul_wide(t12, t23);                 // t24 ← t12 · t23   < 2
-      const auto t25 = FF::mul_wide(t07, t22);                 // t25 ← t07 · t22   < 2
-      const FF X3 = FF::reduce(t25 - t24);                     // X3 ← t25 − t24    < 2
-      const auto t27 = FF::mul_wide(t23, t19);                 // t27 ← t23 · t19   < 2
-      const auto t28 = FF::mul_wide(t22, t21);                 // t28 ← t22 · t21   < 2
-      const FF Y3 = FF::reduce(t28 + t27);                     // Y3 ← t28 + t27    < 2
-      const auto t30 = FF::mul_wide(t19, t07);                 // t30 ← t19 · t07   < 2
-      const auto t31 = FF::mul_wide(t21, t12);                 // t31 ← t21 · t12   < 2
-      const FF Z3 = FF::reduce(t31 + t30);                     // Z3 ← t31 + t30    < 2
-      return {X3, Y3, Z3};
-    }
+  friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2)
+  {
+    const FF X1 = p1.x;                                                                //                   < 2
+    const FF Y1 = p1.y;                                                                //                   < 2
+    const FF Z1 = p1.z;                                                                //                   < 2
+    const FF X2 = p2.x;                                                                //                   < 2
+    const FF Y2 = p2.y;                                                                //                   < 2
+    const FF t00 = X1 * X2;                                                            // t00 ← X1 · X2     < 2
+    const FF t01 = Y1 * Y2;                                                            // t01 ← Y1 · Y2     < 2
+    const FF t02 = Z1;                                                                 // t02 ← Z1          < 2
+    const FF t03 = X1 + Y1;                                                            // t03 ← X1 + Y1     < 4
+    const FF t04 = X2 + Y2;                                                            // t04 ← X2 + Y2     < 4
+    const FF t05 = t03 * t04;                                                          // t05 ← t03 · t04   < 3
+    const FF t06 = t00 + t01;                                                          // t06 ← t00 + t01   < 4
+    const FF t07 = t05 - t06;                                                          // t07 ← t05 − t06   < 2
+    const FF t08 = Y1 + Z1;                                                            // t08 ← Y1 + Z1     < 4
+    const FF t09 = Y2 + FF::one();                                                     // t09 ← Y2 + 1      < 4
+    const FF t10 = t08 * t09;                                                          // t10 ← t08 · t09   < 3
+    const FF t11 = t01 + t02;                                                          // t11 ← t01 + t02   < 4
+    const FF t12 = t10 - t11;                                                          // t12 ← t10 − t11   < 2
+    const FF t13 = X1 + Z1;                                                            // t13 ← X1 + Z1     < 4
+    const FF t14 = X2 + FF::one();                                                     // t14 ← X2 + 1      < 4
+    const FF t15 = t13 * t14;                                                          // t15 ← t13 · t14   < 3
+    const FF t16 = t00 + t02;                                                          // t16 ← t00 + t02   < 4
+    const FF t17 = t15 - t16;                                                          // t17 ← t15 − t16   < 2
+    const FF t18 = t00 + t00;                                                          // t18 ← t00 + t00   < 2
+    const FF t19 = t18 + t00;                                                          // t19 ← t18 + t00   < 2
+    const FF t20 = FF::template mul_unsigned<3>(FF::template mul_const<B_VALUE>(t02)); // t20 ← b3 · t02    < 2
+    const FF t21 = t01 + t20;                                                          // t21 ← t01 + t20   < 2
+    const FF t22 = t01 - t20;                                                          // t22 ← t01 − t20   < 2
+    const FF t23 = FF::template mul_unsigned<3>(FF::template mul_const<B_VALUE>(t17)); // t23 ← b3 · t17    < 2
+    const auto t24 = FF::mul_wide(t12, t23);                                           // t24 ← t12 · t23   < 2
+    const auto t25 = FF::mul_wide(t07, t22);                                           // t25 ← t07 · t22   < 2
+    const FF X3 = FF::reduce(t25 - t24);                                               // X3 ← t25 − t24    < 2
+    const auto t27 = FF::mul_wide(t23, t19);                                           // t27 ← t23 · t19   < 2
+    const auto t28 = FF::mul_wide(t22, t21);                                           // t28 ← t22 · t21   < 2
+    const FF Y3 = FF::reduce(t28 + t27);                                               // Y3 ← t28 + t27    < 2
+    const auto t30 = FF::mul_wide(t19, t07);                                           // t30 ← t19 · t07   < 2
+    const auto t31 = FF::mul_wide(t21, t12);                                           // t31 ← t21 · t12   < 2
+    const FF Z3 = FF::reduce(t31 + t30);                                               // Z3 ← t31 + t30    < 2
+    return {X3, Y3, Z3};
+  }

-    friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine<FF>& p2) {   
-      return p1 + Affine<FF>::neg(p2);
-    }
+  friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine<FF>& p2)
+  {
+    return p1 + Affine<FF>::neg(p2);
+  }

-    friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point) {   
-      Projective res = zero();
-  #ifdef __CUDA_ARCH__
-  #pragma unroll
-  #endif
-      for (int i = 0; i < SCALAR_FF::NBITS; i++) {
-        if (i > 0) {
-          res = res + res;
-        }
-        if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) {
-          res = res + point;
-        }
-      }
-      return res;
+  friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point)
+  {
+    Projective res = zero();
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (int i = 0; i < SCALAR_FF::NBITS; i++) {
+      if (i > 0) { res = res + res; }
+      if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) { res = res + point; }
    }
+    return res;
+  }

-    friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2) {
-      return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z);
-    }
+  friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2)
+  {
+    return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z);
+  }

-    friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) {
-      os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }";
-      return os;
-    }
+  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point)
+  {
+    os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }";
+    return os;
+  }

-    static HOST_DEVICE_INLINE bool is_zero(const Projective &point) {
-      return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero();
-    }
+  static HOST_DEVICE_INLINE bool is_zero(const Projective& point)
+  {
+    return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero();
+  }

-    static HOST_DEVICE_INLINE bool is_on_curve(const Projective &point) {
-      if (is_zero(point))
-        return true;
-      bool eq_holds = (FF::template mul_const<B_VALUE>(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == point.z * FF::sqr(point.y));
-      return point.z != FF::zero() && eq_holds;
-    }
+  static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point)
+  {
+    if (is_zero(point)) return true;
+    bool eq_holds =
+      (FF::template mul_const<B_VALUE>(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x ==
+       point.z * FF::sqr(point.y));
+    return point.z != FF::zero() && eq_holds;
+  }

-    static HOST_INLINE Projective rand_host() {
-      SCALAR_FF rand_scalar = SCALAR_FF::rand_host();
-      return rand_scalar * generator();
-    }
+  static HOST_INLINE Projective rand_host()
+  {
+    SCALAR_FF rand_scalar = SCALAR_FF::rand_host();
+    return rand_scalar * generator();
+  }
 };
--- a/icicle/primitives/test.cu
+++ b/icicle/primitives/test.cu
@@ -1,62 +1,65 @@
+#include "test_kernels.cuh"
+#include <boost/multiprecision/cpp_int.hpp>
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
-#include "test_kernels.cuh"
 #include <iostream>
-#include <boost/multiprecision/cpp_int.hpp>
 namespace mp = boost::multiprecision;

 template <class T>
-int device_populate_random(T* d_elements, unsigned n) {
-    T h_elements[n];
-    for (unsigned i = 0; i < n; i++)
-        h_elements[i] = T::rand_host();
-    return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
+int device_populate_random(T* d_elements, unsigned n)
+{
+  T h_elements[n];
+  for (unsigned i = 0; i < n; i++)
+    h_elements[i] = T::rand_host();
+  return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
 }

 template <class T>
-int device_set(T* d_elements, T el, unsigned n) {
-    T h_elements[n];
-    for (unsigned i = 0; i < n; i++)
-        h_elements[i] = el;
-    return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
+int device_set(T* d_elements, T el, unsigned n)
+{
+  T h_elements[n];
+  for (unsigned i = 0; i < n; i++)
+    h_elements[i] = el;
+  return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
 }

-mp::int1024_t convert_to_boost_mp(uint32_t *a, uint32_t length)
+mp::int1024_t convert_to_boost_mp(uint32_t* a, uint32_t length)
 {
  mp::int1024_t res = 0;
-  for (uint32_t i = 0; i < length; i++)
-  {
+  for (uint32_t i = 0; i < length; i++) {
    res += (mp::int1024_t)(a[i]) << 32 * i;
  }
  return res;
 }

-class PrimitivesTest : public ::testing::Test {
+class PrimitivesTest : public ::testing::Test
+{
 protected:
  static const unsigned n = 1 << 4;

-  projective_t *points1{};
-  projective_t *points2{};
-  g2_projective_t *g2_points1{};
-  g2_projective_t *g2_points2{};
-  scalar_field_t *scalars1{};
-  scalar_field_t *scalars2{};
-  projective_t *zero_points{};
-  g2_projective_t *g2_zero_points{};
-  scalar_field_t *zero_scalars{};
-  scalar_field_t *one_scalars{};
-  affine_t *aff_points{};
-  g2_affine_t *g2_aff_points{};
-  projective_t *res_points1{};
-  projective_t *res_points2{};
-  g2_projective_t *g2_res_points1{};
-  g2_projective_t *g2_res_points2{};
-  scalar_field_t *res_scalars1{};
-  scalar_field_t *res_scalars2{};
-  scalar_field_t::Wide *res_scalars_wide{};
-  scalar_field_t::Wide *res_scalars_wide_full{};
+  projective_t* points1{};
+  projective_t* points2{};
+  g2_projective_t* g2_points1{};
+  g2_projective_t* g2_points2{};
+  scalar_field_t* scalars1{};
+  scalar_field_t* scalars2{};
+  projective_t* zero_points{};
+  g2_projective_t* g2_zero_points{};
+  scalar_field_t* zero_scalars{};
+  scalar_field_t* one_scalars{};
+  affine_t* aff_points{};
+  g2_affine_t* g2_aff_points{};
+  projective_t* res_points1{};
+  projective_t* res_points2{};
+  g2_projective_t* g2_res_points1{};
+  g2_projective_t* g2_res_points2{};
+  scalar_field_t* res_scalars1{};
+  scalar_field_t* res_scalars2{};
+  scalar_field_t::Wide* res_scalars_wide{};
+  scalar_field_t::Wide* res_scalars_wide_full{};

-  PrimitivesTest() {
+  PrimitivesTest()
+  {
    assert(!cudaDeviceReset());
    assert(!cudaMallocManaged(&points1, n * sizeof(projective_t)));
    assert(!cudaMallocManaged(&points2, n * sizeof(projective_t)));
@@ -80,7 +83,8 @@ protected:
    assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field_t::Wide)));
  }

-  ~PrimitivesTest() override {
+  ~PrimitivesTest() override
+  {
    cudaFree(points1);
    cudaFree(points2);
    cudaFree(g2_points1);
@@ -106,7 +110,8 @@ protected:
    cudaDeviceReset();
  }

-  void SetUp() override {
+  void SetUp() override
+  {
    ASSERT_EQ(device_populate_random<projective_t>(points1, n), cudaSuccess);
    ASSERT_EQ(device_populate_random<projective_t>(points2, n), cudaSuccess);
    ASSERT_EQ(device_populate_random<g2_projective_t>(g2_points1, n), cudaSuccess);
@@ -130,32 +135,37 @@ protected:
  }
 };

-TEST_F(PrimitivesTest, FieldAdditionSubtractionCancel) {
+TEST_F(PrimitivesTest, FieldAdditionSubtractionCancel)
+{
  ASSERT_EQ(vec_add(scalars1, scalars2, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_sub(res_scalars1, scalars2, res_scalars2, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i], res_scalars2[i]);
 }

-TEST_F(PrimitivesTest, FieldZeroAddition) {
+TEST_F(PrimitivesTest, FieldZeroAddition)
+{
  ASSERT_EQ(vec_add(scalars1, zero_scalars, res_scalars1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i], res_scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldAdditionHostDeviceEq) {
+TEST_F(PrimitivesTest, FieldAdditionHostDeviceEq)
+{
  ASSERT_EQ(vec_add(scalars1, scalars2, res_scalars1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i] + scalars2[i], res_scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationByOne) {
+TEST_F(PrimitivesTest, FieldMultiplicationByOne)
+{
  ASSERT_EQ(vec_mul(scalars1, one_scalars, res_scalars1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i], res_scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationByMinusOne) {
+TEST_F(PrimitivesTest, FieldMultiplicationByMinusOne)
+{
  ASSERT_EQ(vec_neg(one_scalars, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars1, res_scalars1, res_scalars2, n), cudaSuccess);
  ASSERT_EQ(vec_add(scalars1, res_scalars2, res_scalars1, n), cudaSuccess);
@@ -163,82 +173,95 @@ TEST_F(PrimitivesTest, FieldMultiplicationByMinusOne) {
    ASSERT_EQ(res_scalars1[i], zero_scalars[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationByZero) {
+TEST_F(PrimitivesTest, FieldMultiplicationByZero)
+{
  ASSERT_EQ(vec_mul(scalars1, zero_scalars, res_scalars1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(zero_scalars[i], res_scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationInverseCancel) {
+TEST_F(PrimitivesTest, FieldMultiplicationInverseCancel)
+{
  ASSERT_EQ(vec_mul(scalars1, scalars2, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(field_vec_inv(scalars2, res_scalars2, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i], res_scalars1[i] * res_scalars2[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationHostDeviceEq) {
+TEST_F(PrimitivesTest, FieldMultiplicationHostDeviceEq)
+{
  ASSERT_EQ(vec_mul(scalars1, scalars2, res_scalars1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i] * scalars2[i], res_scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationByTwoEqSum) {
+TEST_F(PrimitivesTest, FieldMultiplicationByTwoEqSum)
+{
  ASSERT_EQ(vec_add(one_scalars, one_scalars, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, scalars1, res_scalars2, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(res_scalars2[i], scalars1[i] + scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldSqrHostDeviceEq) {
+TEST_F(PrimitivesTest, FieldSqrHostDeviceEq)
+{
  ASSERT_EQ(field_vec_sqr(scalars1, res_scalars1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i] * scalars1[i], res_scalars1[i]);
 }

-TEST_F(PrimitivesTest, FieldMultiplicationSqrEq) {
+TEST_F(PrimitivesTest, FieldMultiplicationSqrEq)
+{
  ASSERT_EQ(vec_mul(scalars1, scalars1, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(field_vec_sqr(scalars1, res_scalars2, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(res_scalars1[i], res_scalars2[i]);
 }

-TEST_F(PrimitivesTest, ECRandomPointsAreOnCurve) {
+TEST_F(PrimitivesTest, ECRandomPointsAreOnCurve)
+{
  for (unsigned i = 0; i < n; i++)
    ASSERT_PRED1(projective_t::is_on_curve, points1[i]);
 }

-TEST_F(PrimitivesTest, ECPointAdditionSubtractionCancel) {
+TEST_F(PrimitivesTest, ECPointAdditionSubtractionCancel)
+{
  ASSERT_EQ(vec_add(points1, points2, res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_sub(res_points1, points2, res_points2, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(points1[i], res_points2[i]);
 }

-TEST_F(PrimitivesTest, ECPointZeroAddition) {
+TEST_F(PrimitivesTest, ECPointZeroAddition)
+{
  ASSERT_EQ(vec_add(points1, zero_points, res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(points1[i], res_points1[i]);
 }

-TEST_F(PrimitivesTest, ECPointAdditionHostDeviceEq) {
+TEST_F(PrimitivesTest, ECPointAdditionHostDeviceEq)
+{
  ASSERT_EQ(vec_add(points1, points2, res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(points1[i] + points2[i], res_points1[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationHostDeviceEq) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationHostDeviceEq)
+{
  ASSERT_EQ(vec_mul(scalars1, points1, res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i] * points1[i], res_points1[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationByOne) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationByOne)
+{
  ASSERT_EQ(vec_mul(one_scalars, points1, res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(points1[i], res_points1[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationByMinusOne) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationByMinusOne)
+{
  ASSERT_EQ(vec_neg(one_scalars, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, points1, res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_neg(points1, res_points2, n), cudaSuccess);
@@ -246,14 +269,16 @@ TEST_F(PrimitivesTest, ECScalarMultiplicationByMinusOne) {
    ASSERT_EQ(res_points1[i], res_points2[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationByTwo) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationByTwo)
+{
  ASSERT_EQ(vec_add(one_scalars, one_scalars, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, points1, res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ((one_scalars[i] + one_scalars[i]) * points1[i], res_points1[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationInverseCancel) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationInverseCancel)
+{
  ASSERT_EQ(vec_mul(scalars1, points1, res_points1, n), cudaSuccess);
  ASSERT_EQ(field_vec_inv(scalars1, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, res_points1, res_points2, n), cudaSuccess);
@@ -261,7 +286,8 @@ TEST_F(PrimitivesTest, ECScalarMultiplicationInverseCancel) {
    ASSERT_EQ(points1[i], res_points2[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverMultiplication) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverMultiplication)
+{
  ASSERT_EQ(vec_mul(scalars1, points1, res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars2, res_points1, res_points2, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars1, scalars2, res_scalars1, n), cudaSuccess);
@@ -270,7 +296,8 @@ TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverMultiplication) {
    ASSERT_EQ(res_points1[i], res_points2[i]);
 }

-TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverAddition) {
+TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverAddition)
+{
  ASSERT_EQ(vec_mul(scalars1, points1, res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars2, points1, res_points2, n), cudaSuccess);
  ASSERT_EQ(vec_add(scalars1, scalars2, res_scalars1, n), cudaSuccess);
@@ -278,13 +305,15 @@ TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverAddition) {
    ASSERT_EQ(res_scalars1[i] * points1[i], res_points1[i] + res_points2[i]);
 }

-TEST_F(PrimitivesTest, ECProjectiveToAffine) {
+TEST_F(PrimitivesTest, ECProjectiveToAffine)
+{
  ASSERT_EQ(point_vec_to_affine(points1, aff_points, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(points1[i], projective_t::from_affine(aff_points[i]));
 }

-TEST_F(PrimitivesTest, ECMixedPointAddition) {
+TEST_F(PrimitivesTest, ECMixedPointAddition)
+{
  ASSERT_EQ(point_vec_to_affine(points2, aff_points, n), cudaSuccess);
  ASSERT_EQ(vec_add(points1, aff_points, res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_add(points1, points2, res_points2, n), cudaSuccess);
@@ -292,7 +321,8 @@ TEST_F(PrimitivesTest, ECMixedPointAddition) {
    ASSERT_EQ(res_points1[i], res_points2[i]);
 }

-TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) {
+TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction)
+{
  ASSERT_EQ(point_vec_to_affine(points2, aff_points, n), cudaSuccess);
  ASSERT_EQ(vec_sub(points1, aff_points, res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_neg(points2, res_points2, n), cudaSuccess);
@@ -300,117 +330,100 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) {
    ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]);
 }

-TEST_F(PrimitivesTest, MP_LSB_MULT) {
+TEST_F(PrimitivesTest, MP_LSB_MULT)
+{
  // LSB multiply, check correctness of first TLC + 1 digits result.
  ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
  std::cout << "first GPU lsb mult output  = 0x";
-  for (int i=0; i<2*scalar_field_t::TLC; i++)
-  {
+  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i];
  }
  std::cout << std::endl;

-
  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
  std::cout << "first GPU full mult output = 0x";
-  for (int i=0; i<2*scalar_field_t::TLC; i++)
-  {
+  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i];
  }
  std::cout << std::endl;
-  for (int j = 0; j < n; j++)
-  {
-    for (int i=0; i<scalar_field_t::TLC + 1; i++)
-    {
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < scalar_field_t::TLC + 1; i++) {
      ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
    }
  }
 }

-TEST_F(PrimitivesTest, MP_MSB_MULT) {
+TEST_F(PrimitivesTest, MP_MSB_MULT)
+{
  // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
  ASSERT_EQ(mp_msb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
  std::cout << "first GPU msb mult output  = 0x";
-  for (int i=2*scalar_field_t::TLC - 1; i >=0 ; i--)
-  {
+  for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) {
    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
  }
  std::cout << std::endl;

-
  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
  std::cout << "first GPU full mult output = 0x";
-  for (int i=2*scalar_field_t::TLC - 1; i >=0 ; i--)
-  {
+  for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) {
    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
  }

  std::cout << std::endl;

-  for (int i=0; i < 2*scalar_field_t::TLC - 1; i++)
-  {
+  for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) {
    if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
-        std::cout << "matched word idx = " << i << std::endl;
+      std::cout << "matched word idx = " << i << std::endl;
  }
-
 }

-TEST_F(PrimitivesTest, INGO_MP_MULT) {
+TEST_F(PrimitivesTest, INGO_MP_MULT)
+{
  // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
  ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
  std::cout << "INGO   = 0x";
-  for (int i=0; i < 2*scalar_field_t::TLC ; i++)
-  {
+  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
  }
  std::cout << std::endl;

-
  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
  std::cout << "ZKSYNC = 0x";
-  for (int i=0; i < 2*scalar_field_t::TLC ; i++)
-  {
+  for (int i = 0; i < 2 * scalar_field_t::TLC; i++) {
    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
  }

  std::cout << std::endl;

-  for (int i=0; i < 2*scalar_field_t::TLC - 1; i++)
-  {
+  for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) {
    if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
-        std::cout << "matched word idx = " << i << std::endl;
+      std::cout << "matched word idx = " << i << std::endl;
  }
-  for (int j=0; j<n; j++)
-  {
-    for (int i=0; i < 2*scalar_field_t::TLC - 1; i++)
-    {
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < 2 * scalar_field_t::TLC - 1; i++) {
      ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
    }
  }
-
 }

-
-TEST_F(PrimitivesTest, INGO_MP_MSB_MULT) {
+TEST_F(PrimitivesTest, INGO_MP_MSB_MULT)
+{
  // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
  ASSERT_EQ(ingo_mp_msb_mult(scalars1, scalars2, res_scalars_wide, n), cudaSuccess);
  std::cout << "INGO MSB   = 0x";
-  for (int i=2*scalar_field_t::TLC - 1; i >= 0  ; i--)
-  {
+  for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) {
    std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
  }
  std::cout << std::endl;

  ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
  std::cout << "ZKSYNC = 0x";
-  for (int i=2*scalar_field_t::TLC - 1; i >= 0  ; i--)
-  {
+  for (int i = 2 * scalar_field_t::TLC - 1; i >= 0; i--) {
    std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
  }

  std::cout << std::endl;
-  
-  
+
  // for (int i=scalar_field::TLC; i < 2*scalar_field::TLC - 1; i++)
  // {
  //   ASSERT_EQ(in_bound, true);
@@ -428,9 +441,8 @@ TEST_F(PrimitivesTest, INGO_MP_MSB_MULT) {
  mp::int1024_t res_mp = 0;
  mp::int1024_t res_gpu = 0;
  uint32_t num_limbs = scalar_field_t::TLC;
-  
-  for (int j=0; j<n; j++)
-  {
+
+  for (int j = 0; j < n; j++) {
    uint32_t* scalar1_limbs = scalars1[j].limbs_storage.limbs;
    uint32_t* scalar2_limbs = scalars2[j].limbs_storage.limbs;
    scalar_1_mp = convert_to_boost_mp(scalar1_limbs, num_limbs);
@@ -438,24 +450,24 @@ TEST_F(PrimitivesTest, INGO_MP_MSB_MULT) {
    res_mp = scalar_1_mp * scalar_2_mp;
    res_mp = res_mp >> (num_limbs * 32);
    res_gpu = convert_to_boost_mp(&(res_scalars_wide[j]).limbs_storage.limbs[num_limbs], num_limbs);
-    std::cout  << "res  mp = " << res_mp << std::endl;
+    std::cout << "res  mp = " << res_mp << std::endl;
    std::cout << "res gpu = " << res_gpu << std::endl;
    std::cout << "error = " << res_mp - res_gpu << std::endl;
    bool upper_bound = res_gpu <= res_mp;
    bool lower_bound = res_gpu > (res_mp - num_limbs);
    bool in_bound = upper_bound && lower_bound;
-    
-    
+
    ASSERT_EQ(in_bound, true);
  }
 }

-TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) {
-  std::cout  << " taking num limbs " <<  std::endl;
+TEST_F(PrimitivesTest, INGO_MP_MOD_MULT)
+{
+  std::cout << " taking num limbs " << std::endl;
  uint32_t num_limbs = scalar_field_t::TLC;
-  std::cout  << " calling gpu... = " <<  std::endl;
+  std::cout << " calling gpu... = " << std::endl;
  ASSERT_EQ(ingo_mp_mod_mult(scalars1, scalars2, res_scalars1, n), cudaSuccess);
-  std::cout  << " gpu call done " <<  std::endl;
+  std::cout << " gpu call done " << std::endl;
  // mp testing
  mp::int1024_t scalar_1_mp = 0;
  mp::int1024_t scalar_2_mp = 0;
@@ -463,10 +475,8 @@ TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) {
  mp::int1024_t res_gpu = 0;
  mp::int1024_t p = convert_to_boost_mp(scalar_field_t::get_modulus().limbs, num_limbs);
  std::cout << " p = " << p << std::endl;
-  
-  
-  for (int j=0; j<n; j++)
-  {
+
+  for (int j = 0; j < n; j++) {
    uint32_t* scalar1_limbs = scalars1[j].limbs_storage.limbs;
    uint32_t* scalar2_limbs = scalars2[j].limbs_storage.limbs;
    scalar_1_mp = convert_to_boost_mp(scalar1_limbs, num_limbs);
@@ -475,51 +485,57 @@ TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) {
    // std::cout << " s2 = " << scalar_2_mp << std::endl;
    res_mp = (scalar_1_mp * scalar_2_mp) % p;
    res_gpu = convert_to_boost_mp((res_scalars1[j]).limbs_storage.limbs, num_limbs);
-    std::cout  << "res  mp = " << res_mp << std::endl;
+    std::cout << "res  mp = " << res_mp << std::endl;
    std::cout << "res gpu = " << res_gpu << std::endl;
    std::cout << "error = " << res_mp - res_gpu << std::endl;
    ASSERT_EQ(res_gpu, res_mp);
  }
 }

-
-TEST_F(PrimitivesTest, G2ECRandomPointsAreOnCurve) {
+TEST_F(PrimitivesTest, G2ECRandomPointsAreOnCurve)
+{
  for (unsigned i = 0; i < n; i++)
    ASSERT_PRED1(g2_projective_t::is_on_curve, g2_points1[i]);
 }

-TEST_F(PrimitivesTest, G2ECPointAdditionSubtractionCancel) {
+TEST_F(PrimitivesTest, G2ECPointAdditionSubtractionCancel)
+{
  ASSERT_EQ(vec_add(g2_points1, g2_points2, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_sub(g2_res_points1, g2_points2, g2_res_points2, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(g2_points1[i], g2_res_points2[i]);
 }

-TEST_F(PrimitivesTest, G2ECPointZeroAddition) {
+TEST_F(PrimitivesTest, G2ECPointZeroAddition)
+{
  ASSERT_EQ(vec_add(g2_points1, g2_zero_points, g2_res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(g2_points1[i], g2_res_points1[i]);
 }

-TEST_F(PrimitivesTest, G2ECPointAdditionHostDeviceEq) {
+TEST_F(PrimitivesTest, G2ECPointAdditionHostDeviceEq)
+{
  ASSERT_EQ(vec_add(g2_points1, g2_points2, g2_res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(g2_points1[i] + g2_points2[i], g2_res_points1[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationHostDeviceEq) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationHostDeviceEq)
+{
  ASSERT_EQ(vec_mul(scalars1, g2_points1, g2_res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(scalars1[i] * g2_points1[i], g2_res_points1[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationByOne) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationByOne)
+{
  ASSERT_EQ(vec_mul(one_scalars, points1, res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(g2_points1[i], g2_res_points1[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationByMinusOne) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationByMinusOne)
+{
  ASSERT_EQ(vec_neg(one_scalars, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, g2_points1, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_neg(g2_points1, g2_res_points2, n), cudaSuccess);
@@ -527,14 +543,16 @@ TEST_F(PrimitivesTest, G2ECScalarMultiplicationByMinusOne) {
    ASSERT_EQ(g2_res_points1[i], g2_res_points2[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationByTwo) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationByTwo)
+{
  ASSERT_EQ(vec_add(one_scalars, one_scalars, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, g2_points1, g2_res_points1, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ((one_scalars[i] + one_scalars[i]) * g2_points1[i], g2_res_points1[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationInverseCancel) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationInverseCancel)
+{
  ASSERT_EQ(vec_mul(scalars1, g2_points1, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(field_vec_inv(scalars1, res_scalars1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(res_scalars1, g2_res_points1, g2_res_points2, n), cudaSuccess);
@@ -542,7 +560,8 @@ TEST_F(PrimitivesTest, G2ECScalarMultiplicationInverseCancel) {
    ASSERT_EQ(g2_points1[i], g2_res_points2[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationIsDistributiveOverMultiplication) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationIsDistributiveOverMultiplication)
+{
  ASSERT_EQ(vec_mul(scalars1, g2_points1, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars2, g2_res_points1, g2_res_points2, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars1, scalars2, res_scalars1, n), cudaSuccess);
@@ -551,7 +570,8 @@ TEST_F(PrimitivesTest, G2ECScalarMultiplicationIsDistributiveOverMultiplication)
    ASSERT_EQ(g2_res_points1[i], g2_res_points2[i]);
 }

-TEST_F(PrimitivesTest, G2ECScalarMultiplicationIsDistributiveOverAddition) {
+TEST_F(PrimitivesTest, G2ECScalarMultiplicationIsDistributiveOverAddition)
+{
  ASSERT_EQ(vec_mul(scalars1, g2_points1, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_mul(scalars2, g2_points1, g2_res_points2, n), cudaSuccess);
  ASSERT_EQ(vec_add(scalars1, scalars2, res_scalars1, n), cudaSuccess);
@@ -559,13 +579,15 @@ TEST_F(PrimitivesTest, G2ECScalarMultiplicationIsDistributiveOverAddition) {
    ASSERT_EQ(res_scalars1[i] * g2_points1[i], g2_res_points1[i] + g2_res_points2[i]);
 }

-TEST_F(PrimitivesTest, G2ECProjectiveToAffine) {
+TEST_F(PrimitivesTest, G2ECProjectiveToAffine)
+{
  ASSERT_EQ(point_vec_to_affine(g2_points1, g2_aff_points, n), cudaSuccess);
  for (unsigned i = 0; i < n; i++)
    ASSERT_EQ(g2_points1[i], g2_projective_t::from_affine(g2_aff_points[i]));
 }

-TEST_F(PrimitivesTest, G2ECMixedPointAddition) {
+TEST_F(PrimitivesTest, G2ECMixedPointAddition)
+{
  ASSERT_EQ(point_vec_to_affine(g2_points2, g2_aff_points, n), cudaSuccess);
  ASSERT_EQ(vec_add(g2_points1, g2_aff_points, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_add(g2_points1, g2_points2, g2_res_points2, n), cudaSuccess);
@@ -573,7 +595,8 @@ TEST_F(PrimitivesTest, G2ECMixedPointAddition) {
    ASSERT_EQ(g2_res_points1[i], g2_res_points2[i]);
 }

-TEST_F(PrimitivesTest, G2ECMixedAdditionOfNegatedPointEqSubtraction) {
+TEST_F(PrimitivesTest, G2ECMixedAdditionOfNegatedPointEqSubtraction)
+{
  ASSERT_EQ(point_vec_to_affine(g2_points2, g2_aff_points, n), cudaSuccess);
  ASSERT_EQ(vec_sub(g2_points1, g2_aff_points, g2_res_points1, n), cudaSuccess);
  ASSERT_EQ(vec_neg(g2_points2, g2_res_points2, n), cudaSuccess);
@@ -581,7 +604,8 @@ TEST_F(PrimitivesTest, G2ECMixedAdditionOfNegatedPointEqSubtraction) {
    ASSERT_EQ(g2_res_points1[i], g2_points1[i] + g2_res_points2[i]);
 }

-int main(int argc, char **argv) {
+int main(int argc, char** argv)
+{
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
 }
--- a/icicle/primitives/test_kernels.cuh
+++ b/icicle/primitives/test_kernels.cuh
@@ -5,189 +5,195 @@

 // TODO: change the curve depending on env variable
 #include "../curves/bn254/curve_config.cuh"
-#include "projective.cuh"
 #include "extension_field.cuh"
+#include "projective.cuh"

 #endif

 using namespace BN254;

 template <class T1, class T2>
-__global__ void add_elements_kernel(const T1 *x, const T2 *y, T1 *result, const unsigned count) {
+__global__ void add_elements_kernel(const T1* x, const T2* y, T1* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = x[gid] + y[gid];
 }

-template <class T1, class T2> int vec_add(const T1 *x, const T2 *y, T1 *result, const unsigned count) {
+template <class T1, class T2>
+int vec_add(const T1* x, const T2* y, T1* result, const unsigned count)
+{
  add_elements_kernel<T1, T2><<<(count - 1) / 32 + 1, 32>>>(x, y, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

 template <class T1, class T2>
-__global__ void sub_elements_kernel(const T1 *x, const T2 *y, T1 *result, const unsigned count) {
+__global__ void sub_elements_kernel(const T1* x, const T2* y, T1* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = x[gid] - y[gid];
 }

-template <class T1, class T2> int vec_sub(const T1 *x, const T2 *y, T1 *result, const unsigned count) {
+template <class T1, class T2>
+int vec_sub(const T1* x, const T2* y, T1* result, const unsigned count)
+{
  sub_elements_kernel<T1, T2><<<(count - 1) / 32 + 1, 32>>>(x, y, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

 template <class T>
-__global__ void neg_elements_kernel(const T *x, T *result, const unsigned count) {
+__global__ void neg_elements_kernel(const T* x, T* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = T::neg(x[gid]);
 }

-template <class T> int vec_neg(const T *x, T *result, const unsigned count) {
+template <class T>
+int vec_neg(const T* x, T* result, const unsigned count)
+{
  neg_elements_kernel<T><<<(count - 1) / 32 + 1, 32>>>(x, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

 template <class F, class G>
-__global__ void mul_elements_kernel(const F *x, const G *y, G *result, const unsigned count) {
+__global__ void mul_elements_kernel(const F* x, const G* y, G* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = x[gid] * y[gid];
 }

-template <class F, class G> int vec_mul(const F *x, const G *y, G *result, const unsigned count) {
+template <class F, class G>
+int vec_mul(const F* x, const G* y, G* result, const unsigned count)
+{
  mul_elements_kernel<F, G><<<(count - 1) / 32 + 1, 32>>>(x, y, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

-__global__ void inv_field_elements_kernel(const scalar_field_t *x, scalar_field_t *result, const unsigned count) {
+__global__ void inv_field_elements_kernel(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = scalar_field_t::inverse(x[gid]);
 }

-int field_vec_inv(const scalar_field_t *x, scalar_field_t *result, const unsigned count) {
+int field_vec_inv(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+{
  inv_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

-__global__ void sqr_field_elements_kernel(const scalar_field_t *x, scalar_field_t *result, const unsigned count) {
+__global__ void sqr_field_elements_kernel(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = scalar_field_t::sqr(x[gid]);
 }

-int field_vec_sqr(const scalar_field_t *x, scalar_field_t *result, const unsigned count) {
+int field_vec_sqr(const scalar_field_t* x, scalar_field_t* result, const unsigned count)
+{
  sqr_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

 template <class P, class A>
-__global__ void to_affine_points_kernel(const P *x, A *result, const unsigned count) {
+__global__ void to_affine_points_kernel(const P* x, A* result, const unsigned count)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count)
-    return;
+  if (gid >= count) return;
  result[gid] = P::to_affine(x[gid]);
 }

-template <class P, class A> int point_vec_to_affine(const P *x, A *result, const unsigned count) {
+template <class P, class A>
+int point_vec_to_affine(const P* x, A* result, const unsigned count)
+{
  to_affine_points_kernel<P, A><<<(count - 1) / 32 + 1, 32>>>(x, result, count);
  int error = cudaGetLastError();
  return error ? error : cudaDeviceSynchronize();
 }

-
-__global__ void mp_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) {
+__global__ void mp_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
  scalar_field_t::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
 }

-
-int mp_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result)
+int mp_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
 {
  mp_mult_kernel<<<1, 32>>>(x, y, result);
  int error = cudaGetLastError();
-  return error ? error :  cudaDeviceSynchronize();
+  return error ? error : cudaDeviceSynchronize();
 }

-
-
-__global__ void mp_lsb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) {
+__global__ void mp_lsb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
  scalar_field_t::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
 }

-
-int mp_lsb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result)
+int mp_lsb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
 {
  mp_lsb_mult_kernel<<<1, 32>>>(x, y, result);
  int error = cudaGetLastError();
-  return error ? error :  cudaDeviceSynchronize();
+  return error ? error : cudaDeviceSynchronize();
 }

-__global__ void mp_msb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) {
+__global__ void mp_msb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
  scalar_field_t::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
 }

-
-int mp_msb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result)
+int mp_msb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
 {
  mp_msb_mult_kernel<<<1, 1>>>(x, y, result);
  int error = cudaGetLastError();
-  return error ? error :  cudaDeviceSynchronize();
+  return error ? error : cudaDeviceSynchronize();
 }

-
-__global__ void ingo_mp_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) {
+__global__ void ingo_mp_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
  scalar_field_t::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
 }

-
-int ingo_mp_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result)
+int ingo_mp_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result)
 {
  ingo_mp_mult_kernel<<<1, 32>>>(x, y, result);
  int error = cudaGetLastError();
-  return error ? error :  cudaDeviceSynchronize();
+  return error ? error : cudaDeviceSynchronize();
 }

-
-__global__ void ingo_mp_msb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) {
+__global__ void ingo_mp_msb_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t::Wide* result)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
  scalar_field_t::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
 }

-
-int ingo_mp_msb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result, const unsigned n)
+int ingo_mp_msb_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t::Wide* result, const unsigned n)
 {
  ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result);
  int error = cudaGetLastError();
-  return error ? error :  cudaDeviceSynchronize();
+  return error ? error : cudaDeviceSynchronize();
 }

-
-__global__ void ingo_mp_mod_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t *result) {
+__global__ void ingo_mp_mod_mult_kernel(const scalar_field_t* x, const scalar_field_t* y, scalar_field_t* result)
+{
  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
  result[gid] = x[gid] * y[gid];
 }

-
-int ingo_mp_mod_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t *result, const unsigned n)
+int ingo_mp_mod_mult(const scalar_field_t* x, scalar_field_t* y, scalar_field_t* result, const unsigned n)
 {
  ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result);
  int error = cudaGetLastError();
-  return error ? error :  cudaDeviceSynchronize();
+  return error ? error : cudaDeviceSynchronize();
 }
--- a/icicle/utils/cuda_utils.cuh
+++ b/icicle/utils/cuda_utils.cuh
@@ -2,39 +2,30 @@
 #include <cuda_runtime.h>

 struct cuda_ctx {
-    int device_id;
-    cudaMemPool_t mempool;
-    cudaStream_t stream;
+  int device_id;
+  cudaMemPool_t mempool;
+  cudaStream_t stream;

-    cuda_ctx(int gpu_id) {
-        gpu_id = gpu_id;
-        cudaMemPoolProps pool_props;
-        pool_props.allocType = cudaMemAllocationTypePinned;
-        pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
-        pool_props.location.type = cudaMemLocationTypeDevice;
-        pool_props.location.id = device_id;
+  cuda_ctx(int gpu_id)
+  {
+    gpu_id = gpu_id;
+    cudaMemPoolProps pool_props;
+    pool_props.allocType = cudaMemAllocationTypePinned;
+    pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
+    pool_props.location.type = cudaMemLocationTypeDevice;
+    pool_props.location.id = device_id;

-        cudaMemPoolCreate(&mempool, &pool_props);
-        cudaStreamCreate(&stream);
-    }
+    cudaMemPoolCreate(&mempool, &pool_props);
+    cudaStreamCreate(&stream);
+  }

-    void set_device() {
-        cudaSetDevice(device_id);
-    }
+  void set_device() { cudaSetDevice(device_id); }

-    void sync_stream() {
-        cudaStreamSynchronize(stream);
-    }
-
-    void malloc(void *ptr, size_t bytesize) {
-        cudaMallocFromPoolAsync(&ptr, bytesize, mempool, stream);
-    }
-
-    void free(void *ptr) {
-        cudaFreeAsync(ptr, stream);
-    }
+  void sync_stream() { cudaStreamSynchronize(stream); }

+  void malloc(void* ptr, size_t bytesize) { cudaMallocFromPoolAsync(&ptr, bytesize, mempool, stream); }

+  void free(void* ptr) { cudaFreeAsync(ptr, stream); }
 };

 // -- Proposed Function Tops --------------------------------------------------
--- a/icicle/utils/host_math.cuh
+++ b/icicle/utils/host_math.cuh
@@ -5,85 +5,92 @@

 namespace host_math {

-// return x + y with uint32_t operands
-static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
+  // return x + y with uint32_t operands
+  static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }

-// return x + y + carry with uint32_t operands
-static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
+  // return x + y + carry with uint32_t operands
+  static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }

-// return x + y and carry out with uint32_t operands
-static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t &carry) {
-  uint32_t result;
-  result = x + y;
-  carry = x > result;
-  return result;
-}
-
-// return x + y + carry and carry out  with uint32_t operands
-static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t &carry) {
-  const uint32_t result = x + y + carry;
-  carry = carry && x >= result || !carry && x > result;
-  return result;
-}
-
-// return x - y with uint32_t operands
-static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
-
-// 	return x - y - borrow with uint32_t operands
-static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
-
-//	return x - y and borrow out with uint32_t operands
-static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t &borrow) {
-  uint32_t result;
-  result = x - y;
-  borrow = x < result;
-  return result;
-}
-
-//	return x - y - borrow and borrow out with uint32_t operands
-static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t &borrow) {
-  const uint32_t result = x - y - borrow;
-  borrow = borrow && x <= result || !borrow && x < result;
-  return result;
-}
-
-// return x * y + z + carry and carry out with uint32_t operands
-static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t &carry) {
-  uint32_t result;
-  uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
-  carry = r >> 32;
-  result = r & 0xffffffff;
-  return result;
-}
-
-
-template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false> struct carry_chain {
-  unsigned index;
-
-  constexpr __host__ __forceinline__ carry_chain() : index(0) {}
-
-  __host__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y, uint32_t &carry) {
-    index++;
-    if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
-      return host_math::add(x, y);
-    else if (index == 1 && !CARRY_IN)
-      return host_math::add_cc(x, y, carry);
-    else if (index < OPS_COUNT || CARRY_OUT)
-      return host_math::addc_cc(x, y, carry);
-    else
-      return host_math::addc(x, y, carry);
+  // return x + y and carry out with uint32_t operands
+  static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
+  {
+    uint32_t result;
+    result = x + y;
+    carry = x > result;
+    return result;
  }

-  __host__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y, uint32_t &carry) {
-    index++;
-    if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
-      return host_math::sub(x, y);
-    else if (index == 1 && !CARRY_IN)
-      return host_math::sub_cc(x, y, carry);
-    else if (index < OPS_COUNT || CARRY_OUT)
-      return host_math::subc_cc(x, y, carry);
-    else
-      return host_math::subc(x, y, carry);
+  // return x + y + carry and carry out  with uint32_t operands
+  static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
+  {
+    const uint32_t result = x + y + carry;
+    carry = carry && x >= result || !carry && x > result;
+    return result;
  }
-};
+
+  // return x - y with uint32_t operands
+  static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
+
+  // 	return x - y - borrow with uint32_t operands
+  static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
+
+  //	return x - y and borrow out with uint32_t operands
+  static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
+  {
+    uint32_t result;
+    result = x - y;
+    borrow = x < result;
+    return result;
+  }
+
+  //	return x - y - borrow and borrow out with uint32_t operands
+  static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
+  {
+    const uint32_t result = x - y - borrow;
+    borrow = borrow && x <= result || !borrow && x < result;
+    return result;
+  }
+
+  // return x * y + z + carry and carry out with uint32_t operands
+  static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
+  {
+    uint32_t result;
+    uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
+    carry = r >> 32;
+    result = r & 0xffffffff;
+    return result;
+  }
+
+  template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
+  struct carry_chain {
+    unsigned index;
+
+    constexpr __host__ __forceinline__ carry_chain() : index(0) {}
+
+    __host__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
+    {
+      index++;
+      if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
+        return host_math::add(x, y);
+      else if (index == 1 && !CARRY_IN)
+        return host_math::add_cc(x, y, carry);
+      else if (index < OPS_COUNT || CARRY_OUT)
+        return host_math::addc_cc(x, y, carry);
+      else
+        return host_math::addc(x, y, carry);
+    }
+
+    __host__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
+    {
+      index++;
+      if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
+        return host_math::sub(x, y);
+      else if (index == 1 && !CARRY_IN)
+        return host_math::sub_cc(x, y, carry);
+      else if (index < OPS_COUNT || CARRY_OUT)
+        return host_math::subc_cc(x, y, carry);
+      else
+        return host_math::subc(x, y, carry);
+    }
+  };
 } // namespace host_math
--- a/icicle/utils/mont.cuh
+++ b/icicle/utils/mont.cuh
@@ -3,23 +3,25 @@
 #include "../appUtils/vector_manipulation/ve_mod_mult.cuh"

 template <typename E>
-int convert_montgomery(E *d_inout, size_t n_elments, bool is_into, cudaStream_t stream)
-{    
-    // Set the grid and block dimensions
-    int num_threads = MAX_THREADS_PER_BLOCK;
-    int num_blocks = (n_elments + num_threads - 1) / num_threads;
-    E mont = is_into ? E::montgomery_r() : E::montgomery_r_inv();
-    template_normalize_kernel<<<num_blocks, num_threads, 0, stream>>>(d_inout, n_elments, mont);
+int convert_montgomery(E* d_inout, size_t n_elments, bool is_into, cudaStream_t stream)
+{
+  // Set the grid and block dimensions
+  int num_threads = MAX_THREADS_PER_BLOCK;
+  int num_blocks = (n_elments + num_threads - 1) / num_threads;
+  E mont = is_into ? E::montgomery_r() : E::montgomery_r_inv();
+  template_normalize_kernel<<<num_blocks, num_threads, 0, stream>>>(d_inout, n_elments, mont);

-    return 0; //TODO: void with propper error handling
+  return 0; // TODO: void with propper error handling
 }

 template <typename E>
-int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) {
-    return convert_montgomery(d_inout, n, true, stream);
+int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream)
+{
+  return convert_montgomery(d_inout, n, true, stream);
 }

 template <typename E>
-int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream){
-    return convert_montgomery(d_inout, n, false, stream);
+int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream)
+{
+  return convert_montgomery(d_inout, n, false, stream);
 }
--- a/icicle/utils/objects.cuh
+++ b/icicle/utils/objects.cuh
@@ -1,73 +1,63 @@
 #pragma once
-template < class F > class Element {
-    public: 
-      int v;
-      __device__ __host__ Element < F > () {
-          v = 0;
-      }
-      __device__ __host__ Element < F > (int r) {
-          v = r % F::q;
-          if (r == F::q) v = F::q; 
-      }
-      __device__ __host__ Element < F > operator + (Element < F >
-          const & obj) {
-          Element < F > res;
-          res.v = (v + obj.v) % F::q;
-          return res;
-      }
-      __device__ __host__ Element < F > operator - (Element < F >
-          const & obj) {
-          Element < F > res;
-          res.v = (v - obj.v) % F::q;
-          if (res.v < 0) {
-              res.v = F::q + res.v;
-          }
-          return res;
-      }
+template <class F>
+class Element
+{
+public:
+  int v;
+  __device__ __host__ Element<F>() { v = 0; }
+  __device__ __host__ Element<F>(int r)
+  {
+    v = r % F::q;
+    if (r == F::q) v = F::q;
+  }
+  __device__ __host__ Element<F> operator+(Element<F> const& obj)
+  {
+    Element<F> res;
+    res.v = (v + obj.v) % F::q;
+    return res;
+  }
+  __device__ __host__ Element<F> operator-(Element<F> const& obj)
+  {
+    Element<F> res;
+    res.v = (v - obj.v) % F::q;
+    if (res.v < 0) { res.v = F::q + res.v; }
+    return res;
+  }
 };

-template < class F > class Scalar {
-    public: 
-        int v;
-      __device__ __host__ Scalar < F > () {
-          v = 0;
-      }
-      __device__ __host__ Scalar < F > (int r) {
-          v = r % F::q;
-      }
-      __device__ __host__ Scalar < F > operator + (Scalar < F >
-          const & obj) {
-          Scalar < F > res;
-          res.v = (v + obj.v) % F::q;
-          return res;
-      }
-      __device__ __host__ Scalar < F > operator * (Scalar < F >
-          const & obj) {
-          Scalar < F > res;
-          res.v = (v * obj.v) % F::q;
-          return res;
-      }
-      __device__ __host__ Element < F > operator * (Element < F >
-          const & obj) {
-          Element < F > res;
-          res.v = (v * obj.v) % F::q;
-          return res;
-      }
-      Scalar < F > operator - (Scalar < F > const & obj) {
-        Scalar < F > res;
-        res.v = (v - obj.v) % F::q;
-        if (res.v < 0) {
-            res.v = F::q + res.v;
-        }
-        return res;
-    }
-    bool operator < (Scalar < F > const & obj) {
-        return v < obj.v;
-    }
-    static Scalar<F> one(){
-        return Scalar<F>(1);
-    }
-    static Scalar<F> zero(){
-        return Scalar<F>(0);
-    }
+template <class F>
+class Scalar
+{
+public:
+  int v;
+  __device__ __host__ Scalar<F>() { v = 0; }
+  __device__ __host__ Scalar<F>(int r) { v = r % F::q; }
+  __device__ __host__ Scalar<F> operator+(Scalar<F> const& obj)
+  {
+    Scalar<F> res;
+    res.v = (v + obj.v) % F::q;
+    return res;
+  }
+  __device__ __host__ Scalar<F> operator*(Scalar<F> const& obj)
+  {
+    Scalar<F> res;
+    res.v = (v * obj.v) % F::q;
+    return res;
+  }
+  __device__ __host__ Element<F> operator*(Element<F> const& obj)
+  {
+    Element<F> res;
+    res.v = (v * obj.v) % F::q;
+    return res;
+  }
+  Scalar<F> operator-(Scalar<F> const& obj)
+  {
+    Scalar<F> res;
+    res.v = (v - obj.v) % F::q;
+    if (res.v < 0) { res.v = F::q + res.v; }
+    return res;
+  }
+  bool operator<(Scalar<F> const& obj) { return v < obj.v; }
+  static Scalar<F> one() { return Scalar<F>(1); }
+  static Scalar<F> zero() { return Scalar<F>(0); }
 };
--- a/icicle/utils/ptx.cuh
+++ b/icicle/utils/ptx.cuh
@@ -4,238 +4,279 @@

 namespace ptx {

-__device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y) {
-  uint32_t result;
-  asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
-  return result;
-}
+  __device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y)
+  {
+    uint32_t result;
+    asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+    return result;
+  }

-__device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z) {
-  uint32_t result;
-  asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
-  return result;
-}
+  __device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+  {
+    uint32_t result;
+    asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+    return result;
+  }

-__device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi) {
-  uint64_t result;
-  asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi));
-  return result;
-}
+  __device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi)
+  {
+    uint64_t result;
+    asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi));
+    return result;
+  }

-// Gives u64 overloads a dedicated namespace.
-// Callers should know exactly what they're calling (no implicit conversions).
-namespace u64 {
+  // Gives u64 overloads a dedicated namespace.
+  // Callers should know exactly what they're calling (no implicit conversions).
+  namespace u64 {

-__device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y) {
-  uint64_t result;
-  asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
-  return result;
-}
+    __device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y)
+    {
+      uint64_t result;
+      asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+      return result;
+    }

-__device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-__device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z) {
-  uint64_t result;
-  asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
-  return result;
-}
+    __device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+    {
+      uint64_t result;
+      asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+      return result;
+    }

-} // namespace u64
+  } // namespace u64

-__device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count) {
-  asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory");
-}
+  __device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count)
+  {
+    asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory");
+  }

-__device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count) { asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory"); }
+  __device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count)
+  {
+    asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory");
+  }

 } // namespace ptx
--- a/icicle/utils/sharedmem.cuh
+++ b/icicle/utils/sharedmem.cuh
@@ -1,15 +1,15 @@
 // based on https://leimao.github.io/blog/CUDA-Shared-Memory-Templated-Kernel/
-// may be outdated, but only worked like that 
+// may be outdated, but only worked like that

 // -------------------------------------------------------------
 // cuDPP -- CUDA Data Parallel Primitives library
 // -------------------------------------------------------------
 // $Revision: 5636 $
 // $Date: 2009-07-02 13:39:38 +1000 (Thu, 02 Jul 2009) $
-// ------------------------------------------------------------- 
-// This source code is distributed under the terms of license.txt 
+// -------------------------------------------------------------
+// This source code is distributed under the terms of license.txt
 // in the root directory of this source distribution.
-// ------------------------------------------------------------- 
+// -------------------------------------------------------------

 /**
 * @file
@@ -18,18 +18,18 @@
 * @brief Shared memory declaration struct for templatized types.
 *
 * Because dynamically sized shared memory arrays are declared "extern" in CUDA,
- * we can't templatize their types directly.  To get around this, we declare a 
- * simple wrapper struct that will declare the extern array with a different 
+ * we can't templatize their types directly.  To get around this, we declare a
+ * simple wrapper struct that will declare the extern array with a different
 * name depending on the type.  This avoids linker errors about multiple
 * definitions.
- * 
- * To use dynamically allocated shared memory in a templatized __global__ or 
+ *
+ * To use dynamically allocated shared memory in a templatized __global__ or
 * __device__ function, just replace code like this:
 *
 * <pre>
 *  template<class T>
 *  __global__ void
- *  foo( T* d_out, T* d_in) 
+ *  foo( T* d_out, T* d_in)
 *  {
 *      // Shared mem size is determined by the host app at run time
 *      extern __shared__  T sdata[];
@@ -38,12 +38,12 @@
 *      ...
 *  }
 * </pre>
- *  
+ *
 *  With this
 * <pre>
 *  template<class T>
 *  __global__ void
- *  foo( T* d_out, T* d_in) 
+ *  foo( T* d_out, T* d_in)
 *  {
 *      // Shared mem size is determined by the host app at run time
 *      SharedMemory<T> smem;
@@ -58,33 +58,32 @@
 #ifndef _SHAREDMEM_H_
 #define _SHAREDMEM_H_

-#include "../curves/bls12_381/curve_config.cuh"
 #include "../curves/bls12_377/curve_config.cuh"
+#include "../curves/bls12_381/curve_config.cuh"
 #include "../curves/bn254/curve_config.cuh"

 /** @brief Wrapper class for templatized dynamic shared memory arrays.
-  * 
-  * This struct uses template specialization on the type \a T to declare
-  * a differently named dynamic shared memory array for each type
-  * (\code extern __shared__ T s_type[] \endcode).
-  * 
-  * Currently there are specializations for the following types:
-  * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long, 
-  * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it
-  * for user defined types.
-  */
+ *
+ * This struct uses template specialization on the type \a T to declare
+ * a differently named dynamic shared memory array for each type
+ * (\code extern __shared__ T s_type[] \endcode).
+ *
+ * Currently there are specializations for the following types:
+ * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
+ * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it
+ * for user defined types.
+ */
 template <typename T>
-struct SharedMemory
-{
-    //! @brief Return a pointer to the runtime-sized shared memory array.
-    //! @returns Pointer to runtime-sized shared memory array
-    __device__ T* getPointer() 
-    { 
-        extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
-        Error_UnsupportedType();
-        return (T*)0;
-    }
-    // TODO: Use operator overloading to make this class look like a regular array
+struct SharedMemory {
+  //! @brief Return a pointer to the runtime-sized shared memory array.
+  //! @returns Pointer to runtime-sized shared memory array
+  __device__ T* getPointer()
+  {
+    extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
+    Error_UnsupportedType();
+    return (T*)0;
+  }
+  // TODO: Use operator overloading to make this class look like a regular array
 };

 // Following are the specializations for the following types.
@@ -92,124 +91,183 @@ struct SharedMemory
 // One could also specialize it for user-defined types.

 template <>
-struct SharedMemory <int>
-{
-    __device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; }      
+struct SharedMemory<int> {
+  __device__ int* getPointer()
+  {
+    extern __shared__ int s_int[];
+    return s_int;
+  }
 };

 template <>
-struct SharedMemory <unsigned int>
-{
-    __device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; }    
+struct SharedMemory<unsigned int> {
+  __device__ unsigned int* getPointer()
+  {
+    extern __shared__ unsigned int s_uint[];
+    return s_uint;
+  }
 };

 template <>
-struct SharedMemory <char>
-{
-    __device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; }    
+struct SharedMemory<char> {
+  __device__ char* getPointer()
+  {
+    extern __shared__ char s_char[];
+    return s_char;
+  }
 };

 template <>
-struct SharedMemory <unsigned char>
-{
-    __device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; }    
+struct SharedMemory<unsigned char> {
+  __device__ unsigned char* getPointer()
+  {
+    extern __shared__ unsigned char s_uchar[];
+    return s_uchar;
+  }
 };

 template <>
-struct SharedMemory <short>
-{
-    __device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; }    
+struct SharedMemory<short> {
+  __device__ short* getPointer()
+  {
+    extern __shared__ short s_short[];
+    return s_short;
+  }
 };

 template <>
-struct SharedMemory <unsigned short>
-{
-    __device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; }    
+struct SharedMemory<unsigned short> {
+  __device__ unsigned short* getPointer()
+  {
+    extern __shared__ unsigned short s_ushort[];
+    return s_ushort;
+  }
 };

 template <>
-struct SharedMemory <long>
-{
-    __device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; }    
+struct SharedMemory<long> {
+  __device__ long* getPointer()
+  {
+    extern __shared__ long s_long[];
+    return s_long;
+  }
 };

 template <>
-struct SharedMemory <unsigned long>
-{
-    __device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; }    
+struct SharedMemory<unsigned long> {
+  __device__ unsigned long* getPointer()
+  {
+    extern __shared__ unsigned long s_ulong[];
+    return s_ulong;
+  }
 };

 template <>
-struct SharedMemory <long long>
-{
-    __device__ long long* getPointer() { extern __shared__ long long s_longlong[]; return s_longlong; }    
+struct SharedMemory<long long> {
+  __device__ long long* getPointer()
+  {
+    extern __shared__ long long s_longlong[];
+    return s_longlong;
+  }
 };

 template <>
-struct SharedMemory <unsigned long long>
-{
-    __device__ unsigned long long* getPointer() { extern __shared__ unsigned long long s_ulonglong[]; return s_ulonglong; }    
+struct SharedMemory<unsigned long long> {
+  __device__ unsigned long long* getPointer()
+  {
+    extern __shared__ unsigned long long s_ulonglong[];
+    return s_ulonglong;
+  }
 };

 template <>
-struct SharedMemory <bool>
-{
-    __device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; }    
+struct SharedMemory<bool> {
+  __device__ bool* getPointer()
+  {
+    extern __shared__ bool s_bool[];
+    return s_bool;
+  }
 };

 template <>
-struct SharedMemory <float>
-{
-    __device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; }    
+struct SharedMemory<float> {
+  __device__ float* getPointer()
+  {
+    extern __shared__ float s_float[];
+    return s_float;
+  }
 };

 template <>
-struct SharedMemory <double>
-{
-    __device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; }    
+struct SharedMemory<double> {
+  __device__ double* getPointer()
+  {
+    extern __shared__ double s_double[];
+    return s_double;
+  }
 };

 template <>
-struct SharedMemory <uchar4>
-{
-    __device__ uchar4* getPointer() { extern __shared__ uchar4 s_uchar4[]; return s_uchar4; }    
+struct SharedMemory<uchar4> {
+  __device__ uchar4* getPointer()
+  {
+    extern __shared__ uchar4 s_uchar4[];
+    return s_uchar4;
+  }
 };

 template <>
-struct SharedMemory <BLS12_381::scalar_t>
-{
-    __device__ BLS12_381::scalar_t* getPointer() { extern __shared__ BLS12_381::scalar_t s_scalar_t_bls12_381[]; return s_scalar_t_bls12_381; }    
+struct SharedMemory<BLS12_381::scalar_t> {
+  __device__ BLS12_381::scalar_t* getPointer()
+  {
+    extern __shared__ BLS12_381::scalar_t s_scalar_t_bls12_381[];
+    return s_scalar_t_bls12_381;
+  }
 };

 template <>
-struct SharedMemory <BLS12_381::projective_t>
-{
-    __device__ BLS12_381::projective_t* getPointer() { extern __shared__ BLS12_381::projective_t s_projective_t_bls12_381[]; return s_projective_t_bls12_381; }    
+struct SharedMemory<BLS12_381::projective_t> {
+  __device__ BLS12_381::projective_t* getPointer()
+  {
+    extern __shared__ BLS12_381::projective_t s_projective_t_bls12_381[];
+    return s_projective_t_bls12_381;
+  }
 };

 template <>
-struct SharedMemory <BLS12_377::scalar_t>
-{
-    __device__ BLS12_377::scalar_t* getPointer() { extern __shared__ BLS12_377::scalar_t s_scalar_t_bls12_377[]; return s_scalar_t_bls12_377; }    
+struct SharedMemory<BLS12_377::scalar_t> {
+  __device__ BLS12_377::scalar_t* getPointer()
+  {
+    extern __shared__ BLS12_377::scalar_t s_scalar_t_bls12_377[];
+    return s_scalar_t_bls12_377;
+  }
 };

 template <>
-struct SharedMemory <BLS12_377::projective_t>
-{
-    __device__ BLS12_377::projective_t* getPointer() { extern __shared__ BLS12_377::projective_t s_projective_t_bls12_377[]; return s_projective_t_bls12_377; }    
-};
-
-
-template <>
-struct SharedMemory <BN254::scalar_t>
-{
-    __device__ BN254::scalar_t* getPointer() { extern __shared__ BN254::scalar_t s_scalar_t_bn254[]; return s_scalar_t_bn254; }    
+struct SharedMemory<BLS12_377::projective_t> {
+  __device__ BLS12_377::projective_t* getPointer()
+  {
+    extern __shared__ BLS12_377::projective_t s_projective_t_bls12_377[];
+    return s_projective_t_bls12_377;
+  }
 };

 template <>
-struct SharedMemory <BN254::projective_t>
-{
-    __device__ BN254::projective_t* getPointer() { extern __shared__ BN254::projective_t s_projective_t_bn254[]; return s_projective_t_bn254; }    
+struct SharedMemory<BN254::scalar_t> {
+  __device__ BN254::scalar_t* getPointer()
+  {
+    extern __shared__ BN254::scalar_t s_scalar_t_bn254[];
+    return s_scalar_t_bn254;
+  }
+};
+
+template <>
+struct SharedMemory<BN254::projective_t> {
+  __device__ BN254::projective_t* getPointer()
+  {
+    extern __shared__ BN254::projective_t s_projective_t_bn254[];
+    return s_projective_t_bn254;
+  }
 };
 #endif //_SHAREDMEM_H_

--- a/icicle/utils/storage.cuh
+++ b/icicle/utils/storage.cuh
@@ -3,11 +3,15 @@

 #define LIMBS_ALIGNMENT(x) ((x) % 4 == 0 ? 16 : ((x) % 2 == 0 ? 8 : 4))

-template <unsigned LIMBS_COUNT> struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage {
+template <unsigned LIMBS_COUNT>
+struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage
+{
  static constexpr unsigned LC = LIMBS_COUNT;
  uint32_t limbs[LIMBS_COUNT];
 };

-template <unsigned OMEGAS_COUNT, unsigned LIMBS_COUNT> struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage_array {
-    storage<LIMBS_COUNT> storages[OMEGAS_COUNT];
+template <unsigned OMEGAS_COUNT, unsigned LIMBS_COUNT>
+struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage_array
+{
+  storage<LIMBS_COUNT> storages[OMEGAS_COUNT];
 };
--- a/src/curve_templates/curve_different_limbs.rs
+++ b/src/curve_templates/curve_different_limbs.rs
@@ -1,13 +1,10 @@
 use std::ffi::c_uint;
-
 use ark_CURVE_NAME_L::{Fq as Fq_CURVE_NAME_U, Fr as Fr_CURVE_NAME_U, G1Affine as G1Affine_CURVE_NAME_U, G1Projective as G1Projective_CURVE_NAME_U};
-
 use ark_ec::AffineCurve;
 use ark_ff::{BigInteger_limbs_q, BigInteger_limbs_p, PrimeField};
 use std::mem::transmute;
 use ark_ff::Field;
 use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}};
-
 use rustacuda_core::DeviceCopy;
 use rustacuda_derive::DeviceCopy;

@@ -143,7 +140,6 @@ impl Point_CURVE_NAME_U {

    pub fn to_ark_affine(&self) -> G1Affine_CURVE_NAME_U {
        //TODO: generic conversion
-        use ark_ff::Field;
        use std::ops::Mul;
        let proj_x_field = Fq_CURVE_NAME_U::from_le_bytes_mod_order(&self.x.to_bytes_le());
        let proj_y_field = Fq_CURVE_NAME_U::from_le_bytes_mod_order(&self.y.to_bytes_le());
@@ -155,7 +151,6 @@ impl Point_CURVE_NAME_U {
    }

    pub fn from_ark(ark: G1Projective_CURVE_NAME_U) -> Point_CURVE_NAME_U {
-        use ark_ff::Field;
        let z_inv = ark.z.inverse().unwrap();
        let z_invsq = z_inv * z_inv;
        let z_invq3 = z_invsq * z_inv;
--- a/src/curve_templates/curve_same_limbs.rs
+++ b/src/curve_templates/curve_same_limbs.rs
@@ -1,13 +1,10 @@
 use std::ffi::c_uint;
-
 use ark_CURVE_NAME_L::{Fq as Fq_CURVE_NAME_U, Fr as Fr_CURVE_NAME_U, G1Affine as G1Affine_CURVE_NAME_U, G1Projective as G1Projective_CURVE_NAME_U};
-
 use ark_ec::AffineCurve;
 use ark_ff::{BigInteger_limbs_p, PrimeField};
 use std::mem::transmute;
 use ark_ff::Field;
 use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}};
-
 use rustacuda_core::DeviceCopy;
 use rustacuda_derive::DeviceCopy;

--- a/src/curve_templates/test.rs
+++ b/src/curve_templates/test.rs
@@ -1,14 +1,9 @@
 use std::ffi::{c_int, c_uint};
-
 use rand::{rngs::StdRng, RngCore, SeedableRng};
-
-
 use crate::curves::CURVE_NAME_L::*;
-
 use ark_CURVE_NAME_L::{Fr as Fr_CURVE_NAME_U, G1Projective as G1Projective_CURVE_NAME_U};
 use ark_ff::PrimeField;
 use ark_std::UniformRand;
-
 use rustacuda::prelude::*;
 use rustacuda_core::DevicePointer;
 use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
--- a/src/curves/bls12_377.rs
+++ b/src/curves/bls12_377.rs
@@ -1,15 +1,12 @@
-use std::ffi::c_uint;
-
-use ark_bls12_377::{Fq as Fq_BLS12_377, Fr as Fr_BLS12_377, G1Affine as G1Affine_BLS12_377, G1Projective as G1Projective_BLS12_377};
-
+use crate::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
+use ark_bls12_377::{Fq as Fq_BLS12_377, G1Affine as G1Affine_BLS12_377, G1Projective as G1Projective_BLS12_377};
 use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use std::mem::transmute;
 use ark_ff::Field;
-use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}};
-
+use ark_ff::{BigInteger256, BigInteger384, PrimeField};
 use rustacuda_core::DeviceCopy;
 use rustacuda_derive::DeviceCopy;
+use std::ffi::c_uint;
+use std::mem::transmute;

 #[derive(Debug, PartialEq, Copy, Clone)]
 #[repr(C)]
@@ -27,9 +24,7 @@ impl<const NUM_LIMBS: usize> Default for Field_BLS12_377<NUM_LIMBS> {

 impl<const NUM_LIMBS: usize> Field_BLS12_377<NUM_LIMBS> {
    pub fn zero() -> Self {
-        Field_BLS12_377 {
-            s: [0u32; NUM_LIMBS],
-        }
+        Field_BLS12_377 { s: [0u32; NUM_LIMBS] }
    }

    pub fn one() -> Self {
@@ -41,7 +36,10 @@ impl<const NUM_LIMBS: usize> Field_BLS12_377<NUM_LIMBS> {
    fn to_bytes_le(&self) -> Vec<u8> {
        self.s
            .iter()
-            .map(|s| s.to_le_bytes().to_vec())
+            .map(|s| {
+                s.to_le_bytes()
+                    .to_vec()
+            })
            .flatten()
            .collect::<Vec<_>>()
    }
@@ -50,7 +48,9 @@ impl<const NUM_LIMBS: usize> Field_BLS12_377<NUM_LIMBS> {
 pub const BASE_LIMBS_BLS12_377: usize = 12;
 pub const SCALAR_LIMBS_BLS12_377: usize = 8;

+#[allow(non_camel_case_types)]
 pub type BaseField_BLS12_377 = Field_BLS12_377<BASE_LIMBS_BLS12_377>;
+#[allow(non_camel_case_types)]
 pub type ScalarField_BLS12_377 = Field_BLS12_377<SCALAR_LIMBS_BLS12_377>;

 fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
@@ -60,7 +60,9 @@ fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
            padded[..val.len()].copy_from_slice(&val);
            padded
        }
-        n if n == NUM_LIMBS => val.try_into().unwrap(),
+        n if n == NUM_LIMBS => val
+            .try_into()
+            .unwrap(),
        _ => panic!("slice has too many elements"),
    }
 }
@@ -77,7 +79,11 @@ impl BaseField_BLS12_377 {
    }

    pub fn to_ark(&self) -> BigInteger384 {
-        BigInteger384::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
+        BigInteger384::new(
+            u32_vec_to_u64_vec(&self.limbs())
+                .try_into()
+                .unwrap(),
+        )
    }

    pub fn from_ark(ark: BigInteger384) -> Self {
@@ -91,7 +97,11 @@ impl ScalarField_BLS12_377 {
    }

    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
+        BigInteger256::new(
+            u32_vec_to_u64_vec(&self.limbs())
+                .try_into()
+                .unwrap(),
+        )
    }

    pub fn from_ark(ark: BigInteger256) -> Self {
@@ -136,25 +146,41 @@ impl Point_BLS12_377 {

    pub fn to_ark(&self) -> G1Projective_BLS12_377 {
        //TODO: generic conversion
-        self.to_ark_affine().into_projective()
+        self.to_ark_affine()
+            .into_projective()
    }

    pub fn to_ark_affine(&self) -> G1Affine_BLS12_377 {
        //TODO: generic conversion
-        use ark_ff::Field;
        use std::ops::Mul;
-        let proj_x_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.x.to_bytes_le());
-        let proj_y_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.y.to_bytes_le());
-        let proj_z_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.z.to_bytes_le());
-        let inverse_z = proj_z_field.inverse().unwrap();
+        let proj_x_field = Fq_BLS12_377::from_le_bytes_mod_order(
+            &self
+                .x
+                .to_bytes_le(),
+        );
+        let proj_y_field = Fq_BLS12_377::from_le_bytes_mod_order(
+            &self
+                .y
+                .to_bytes_le(),
+        );
+        let proj_z_field = Fq_BLS12_377::from_le_bytes_mod_order(
+            &self
+                .z
+                .to_bytes_le(),
+        );
+        let inverse_z = proj_z_field
+            .inverse()
+            .unwrap();
        let aff_x = proj_x_field.mul(inverse_z);
        let aff_y = proj_y_field.mul(inverse_z);
        G1Affine_BLS12_377::new(aff_x, aff_y, false)
    }

    pub fn from_ark(ark: G1Projective_BLS12_377) -> Point_BLS12_377 {
-        use ark_ff::Field;
-        let z_inv = ark.z.inverse().unwrap();
+        let z_inv = ark
+            .z
+            .inverse()
+            .unwrap();
        let z_invsq = z_inv * z_inv;
        let z_invq3 = z_invsq * z_inv;
        Point_BLS12_377 {
@@ -196,17 +222,19 @@ impl PointAffineNoInfinity_BLS12_377 {
    ///From u32 limbs x,y
    pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
        PointAffineNoInfinity_BLS12_377 {
-            x: BaseField_BLS12_377 {
-                s: get_fixed_limbs(x),
-            },
-            y: BaseField_BLS12_377 {
-                s: get_fixed_limbs(y),
-            },
+            x: BaseField_BLS12_377 { s: get_fixed_limbs(x) },
+            y: BaseField_BLS12_377 { s: get_fixed_limbs(y) },
        }
    }

    pub fn limbs(&self) -> Vec<u32> {
-        [self.x.limbs(), self.y.limbs()].concat()
+        [
+            self.x
+                .limbs(),
+            self.y
+                .limbs(),
+        ]
+        .concat()
    }

    pub fn to_projective(&self) -> Point_BLS12_377 {
@@ -218,13 +246,31 @@ impl PointAffineNoInfinity_BLS12_377 {
    }

    pub fn to_ark(&self) -> G1Affine_BLS12_377 {
-        G1Affine_BLS12_377::new(Fq_BLS12_377::new(self.x.to_ark()), Fq_BLS12_377::new(self.y.to_ark()), false)
+        G1Affine_BLS12_377::new(
+            Fq_BLS12_377::new(
+                self.x
+                    .to_ark(),
+            ),
+            Fq_BLS12_377::new(
+                self.y
+                    .to_ark(),
+            ),
+            false,
+        )
    }

    pub fn to_ark_repr(&self) -> G1Affine_BLS12_377 {
        G1Affine_BLS12_377::new(
-            Fq_BLS12_377::from_repr(self.x.to_ark()).unwrap(),
-            Fq_BLS12_377::from_repr(self.y.to_ark()).unwrap(),
+            Fq_BLS12_377::from_repr(
+                self.x
+                    .to_ark(),
+            )
+            .unwrap(),
+            Fq_BLS12_377::from_repr(
+                self.y
+                    .to_ark(),
+            )
+            .unwrap(),
            false,
        )
    }
@@ -242,30 +288,35 @@ impl Point_BLS12_377 {

    pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
        Point_BLS12_377 {
-            x: BaseField_BLS12_377 {
-                s: get_fixed_limbs(x),
-            },
-            y: BaseField_BLS12_377 {
-                s: get_fixed_limbs(y),
-            },
-            z: BaseField_BLS12_377 {
-                s: get_fixed_limbs(z),
-            },
+            x: BaseField_BLS12_377 { s: get_fixed_limbs(x) },
+            y: BaseField_BLS12_377 { s: get_fixed_limbs(y) },
+            z: BaseField_BLS12_377 { s: get_fixed_limbs(z) },
        }
    }

    pub fn from_xy_limbs(value: &[u32]) -> Point_BLS12_377 {
        let l = value.len();
-        assert_eq!(l, 3 * BASE_LIMBS_BLS12_377, "length must be 3 * {}", BASE_LIMBS_BLS12_377);
+        assert_eq!(
+            l,
+            3 * BASE_LIMBS_BLS12_377,
+            "length must be 3 * {}",
+            BASE_LIMBS_BLS12_377
+        );
        Point_BLS12_377 {
            x: BaseField_BLS12_377 {
-                s: value[..BASE_LIMBS_BLS12_377].try_into().unwrap(),
+                s: value[..BASE_LIMBS_BLS12_377]
+                    .try_into()
+                    .unwrap(),
            },
            y: BaseField_BLS12_377 {
-                s: value[BASE_LIMBS_BLS12_377..BASE_LIMBS_BLS12_377 * 2].try_into().unwrap(),
+                s: value[BASE_LIMBS_BLS12_377..BASE_LIMBS_BLS12_377 * 2]
+                    .try_into()
+                    .unwrap(),
            },
            z: BaseField_BLS12_377 {
-                s: value[BASE_LIMBS_BLS12_377 * 2..].try_into().unwrap(),
+                s: value[BASE_LIMBS_BLS12_377 * 2..]
+                    .try_into()
+                    .unwrap(),
            },
        }
    }
@@ -273,16 +324,21 @@ impl Point_BLS12_377 {
    pub fn to_affine(&self) -> PointAffineNoInfinity_BLS12_377 {
        let ark_affine = self.to_ark_affine();
        PointAffineNoInfinity_BLS12_377 {
-            x: BaseField_BLS12_377::from_ark(ark_affine.x.into_repr()),
-            y: BaseField_BLS12_377::from_ark(ark_affine.y.into_repr()),
+            x: BaseField_BLS12_377::from_ark(
+                ark_affine
+                    .x
+                    .into_repr(),
+            ),
+            y: BaseField_BLS12_377::from_ark(
+                ark_affine
+                    .y
+                    .into_repr(),
+            ),
        }
    }

    pub fn to_xy_strip_z(&self) -> PointAffineNoInfinity_BLS12_377 {
-        PointAffineNoInfinity_BLS12_377 {
-            x: self.x,
-            y: self.y,
-        }
+        PointAffineNoInfinity_BLS12_377 { x: self.x, y: self.y }
    }
 }

@@ -294,12 +350,9 @@ impl ScalarField_BLS12_377 {
    }
 }

-
 #[cfg(test)]
 mod tests {
-    use ark_bls12_377::{Fr as Fr_BLS12_377};
-
-    use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}, curves::bls12_377::{Point_BLS12_377, ScalarField_BLS12_377}};
+    use crate::curves::bls12_377::{Point_BLS12_377, ScalarField_BLS12_377};

    #[test]
    fn test_ark_scalar_convert() {
@@ -329,4 +382,4 @@ mod tests {
        );
        assert!(left != right);
    }
-}
+}
--- a/src/curves/bls12_381.rs
+++ b/src/curves/bls12_381.rs
@@ -1,16 +1,13 @@
-use std::ffi::c_uint;
-
-use ark_bls12_381::{Fq as Fq_BLS12_381, Fr as Fr_BLS12_381, G1Affine as G1Affine_BLS12_381, G1Projective as G1Projective_BLS12_381};
-
+use crate::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
+use ark_bls12_381::{Fq as Fq_BLS12_381, G1Affine as G1Affine_BLS12_381, G1Projective as G1Projective_BLS12_381};
 use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use serde::{Serialize, Deserialize};
-use std::mem::transmute;
 use ark_ff::Field;
-use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}};
-
+use ark_ff::{BigInteger256, BigInteger384, PrimeField};
 use rustacuda_core::DeviceCopy;
 use rustacuda_derive::DeviceCopy;
+use serde::{Deserialize, Serialize};
+use std::ffi::c_uint;
+use std::mem::transmute;

 #[derive(Debug, PartialEq, Copy, Clone)]
 #[repr(C)]
@@ -28,9 +25,7 @@ impl<const NUM_LIMBS: usize> Default for Field_BLS12_381<NUM_LIMBS> {

 impl<const NUM_LIMBS: usize> Field_BLS12_381<NUM_LIMBS> {
    pub fn zero() -> Self {
-        Field_BLS12_381 {
-            s: [0u32; NUM_LIMBS],
-        }
+        Field_BLS12_381 { s: [0u32; NUM_LIMBS] }
    }

    pub fn one() -> Self {
@@ -42,7 +37,10 @@ impl<const NUM_LIMBS: usize> Field_BLS12_381<NUM_LIMBS> {
    fn to_bytes_le(&self) -> Vec<u8> {
        self.s
            .iter()
-            .map(|s| s.to_le_bytes().to_vec())
+            .map(|s| {
+                s.to_le_bytes()
+                    .to_vec()
+            })
            .flatten()
            .collect::<Vec<_>>()
    }
@@ -51,7 +49,9 @@ impl<const NUM_LIMBS: usize> Field_BLS12_381<NUM_LIMBS> {
 pub const BASE_LIMBS_BLS12_381: usize = 12;
 pub const SCALAR_LIMBS_BLS12_381: usize = 8;

+#[allow(non_camel_case_types)]
 pub type BaseField_BLS12_381 = Field_BLS12_381<BASE_LIMBS_BLS12_381>;
+#[allow(non_camel_case_types)]
 pub type ScalarField_BLS12_381 = Field_BLS12_381<SCALAR_LIMBS_BLS12_381>;

 impl Serialize for ScalarField_BLS12_381 {
@@ -59,7 +59,8 @@ impl Serialize for ScalarField_BLS12_381 {
    where
        S: serde::Serializer,
    {
-        self.s.serialize(serializer)
+        self.s
+            .serialize(serializer)
    }
 }

@@ -80,7 +81,9 @@ fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
            padded[..val.len()].copy_from_slice(&val);
            padded
        }
-        n if n == NUM_LIMBS => val.try_into().unwrap(),
+        n if n == NUM_LIMBS => val
+            .try_into()
+            .unwrap(),
        _ => panic!("slice has too many elements"),
    }
 }
@@ -97,7 +100,11 @@ impl BaseField_BLS12_381 {
    }

    pub fn to_ark(&self) -> BigInteger384 {
-        BigInteger384::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
+        BigInteger384::new(
+            u32_vec_to_u64_vec(&self.limbs())
+                .try_into()
+                .unwrap(),
+        )
    }

    pub fn from_ark(ark: BigInteger384) -> Self {
@@ -111,7 +118,11 @@ impl ScalarField_BLS12_381 {
    }

    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
+        BigInteger256::new(
+            u32_vec_to_u64_vec(&self.limbs())
+                .try_into()
+                .unwrap(),
+        )
    }

    pub fn from_ark(ark: BigInteger256) -> Self {
@@ -156,25 +167,41 @@ impl Point_BLS12_381 {

    pub fn to_ark(&self) -> G1Projective_BLS12_381 {
        //TODO: generic conversion
-        self.to_ark_affine().into_projective()
+        self.to_ark_affine()
+            .into_projective()
    }

    pub fn to_ark_affine(&self) -> G1Affine_BLS12_381 {
        //TODO: generic conversion
-        use ark_ff::Field;
        use std::ops::Mul;
-        let proj_x_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.x.to_bytes_le());
-        let proj_y_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.y.to_bytes_le());
-        let proj_z_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.z.to_bytes_le());
-        let inverse_z = proj_z_field.inverse().unwrap();
+        let proj_x_field = Fq_BLS12_381::from_le_bytes_mod_order(
+            &self
+                .x
+                .to_bytes_le(),
+        );
+        let proj_y_field = Fq_BLS12_381::from_le_bytes_mod_order(
+            &self
+                .y
+                .to_bytes_le(),
+        );
+        let proj_z_field = Fq_BLS12_381::from_le_bytes_mod_order(
+            &self
+                .z
+                .to_bytes_le(),
+        );
+        let inverse_z = proj_z_field
+            .inverse()
+            .unwrap();
        let aff_x = proj_x_field.mul(inverse_z);
        let aff_y = proj_y_field.mul(inverse_z);
        G1Affine_BLS12_381::new(aff_x, aff_y, false)
    }

    pub fn from_ark(ark: G1Projective_BLS12_381) -> Point_BLS12_381 {
-        use ark_ff::Field;
-        let z_inv = ark.z.inverse().unwrap();
+        let z_inv = ark
+            .z
+            .inverse()
+            .unwrap();
        let z_invsq = z_inv * z_inv;
        let z_invq3 = z_invsq * z_inv;
        Point_BLS12_381 {
@@ -216,17 +243,19 @@ impl PointAffineNoInfinity_BLS12_381 {
    ///From u32 limbs x,y
    pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
        PointAffineNoInfinity_BLS12_381 {
-            x: BaseField_BLS12_381 {
-                s: get_fixed_limbs(x),
-            },
-            y: BaseField_BLS12_381 {
-                s: get_fixed_limbs(y),
-            },
+            x: BaseField_BLS12_381 { s: get_fixed_limbs(x) },
+            y: BaseField_BLS12_381 { s: get_fixed_limbs(y) },
        }
    }

    pub fn limbs(&self) -> Vec<u32> {
-        [self.x.limbs(), self.y.limbs()].concat()
+        [
+            self.x
+                .limbs(),
+            self.y
+                .limbs(),
+        ]
+        .concat()
    }

    pub fn to_projective(&self) -> Point_BLS12_381 {
@@ -238,13 +267,31 @@ impl PointAffineNoInfinity_BLS12_381 {
    }

    pub fn to_ark(&self) -> G1Affine_BLS12_381 {
-        G1Affine_BLS12_381::new(Fq_BLS12_381::new(self.x.to_ark()), Fq_BLS12_381::new(self.y.to_ark()), false)
+        G1Affine_BLS12_381::new(
+            Fq_BLS12_381::new(
+                self.x
+                    .to_ark(),
+            ),
+            Fq_BLS12_381::new(
+                self.y
+                    .to_ark(),
+            ),
+            false,
+        )
    }

    pub fn to_ark_repr(&self) -> G1Affine_BLS12_381 {
        G1Affine_BLS12_381::new(
-            Fq_BLS12_381::from_repr(self.x.to_ark()).unwrap(),
-            Fq_BLS12_381::from_repr(self.y.to_ark()).unwrap(),
+            Fq_BLS12_381::from_repr(
+                self.x
+                    .to_ark(),
+            )
+            .unwrap(),
+            Fq_BLS12_381::from_repr(
+                self.y
+                    .to_ark(),
+            )
+            .unwrap(),
            false,
        )
    }
@@ -262,30 +309,35 @@ impl Point_BLS12_381 {

    pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
        Point_BLS12_381 {
-            x: BaseField_BLS12_381 {
-                s: get_fixed_limbs(x),
-            },
-            y: BaseField_BLS12_381 {
-                s: get_fixed_limbs(y),
-            },
-            z: BaseField_BLS12_381 {
-                s: get_fixed_limbs(z),
-            },
+            x: BaseField_BLS12_381 { s: get_fixed_limbs(x) },
+            y: BaseField_BLS12_381 { s: get_fixed_limbs(y) },
+            z: BaseField_BLS12_381 { s: get_fixed_limbs(z) },
        }
    }

    pub fn from_xy_limbs(value: &[u32]) -> Point_BLS12_381 {
        let l = value.len();
-        assert_eq!(l, 3 * BASE_LIMBS_BLS12_381, "length must be 3 * {}", BASE_LIMBS_BLS12_381);
+        assert_eq!(
+            l,
+            3 * BASE_LIMBS_BLS12_381,
+            "length must be 3 * {}",
+            BASE_LIMBS_BLS12_381
+        );
        Point_BLS12_381 {
            x: BaseField_BLS12_381 {
-                s: value[..BASE_LIMBS_BLS12_381].try_into().unwrap(),
+                s: value[..BASE_LIMBS_BLS12_381]
+                    .try_into()
+                    .unwrap(),
            },
            y: BaseField_BLS12_381 {
-                s: value[BASE_LIMBS_BLS12_381..BASE_LIMBS_BLS12_381 * 2].try_into().unwrap(),
+                s: value[BASE_LIMBS_BLS12_381..BASE_LIMBS_BLS12_381 * 2]
+                    .try_into()
+                    .unwrap(),
            },
            z: BaseField_BLS12_381 {
-                s: value[BASE_LIMBS_BLS12_381 * 2..].try_into().unwrap(),
+                s: value[BASE_LIMBS_BLS12_381 * 2..]
+                    .try_into()
+                    .unwrap(),
            },
        }
    }
@@ -293,16 +345,21 @@ impl Point_BLS12_381 {
    pub fn to_affine(&self) -> PointAffineNoInfinity_BLS12_381 {
        let ark_affine = self.to_ark_affine();
        PointAffineNoInfinity_BLS12_381 {
-            x: BaseField_BLS12_381::from_ark(ark_affine.x.into_repr()),
-            y: BaseField_BLS12_381::from_ark(ark_affine.y.into_repr()),
+            x: BaseField_BLS12_381::from_ark(
+                ark_affine
+                    .x
+                    .into_repr(),
+            ),
+            y: BaseField_BLS12_381::from_ark(
+                ark_affine
+                    .y
+                    .into_repr(),
+            ),
        }
    }

    pub fn to_xy_strip_z(&self) -> PointAffineNoInfinity_BLS12_381 {
-        PointAffineNoInfinity_BLS12_381 {
-            x: self.x,
-            y: self.y,
-        }
+        PointAffineNoInfinity_BLS12_381 { x: self.x, y: self.y }
    }
 }

@@ -314,12 +371,10 @@ impl ScalarField_BLS12_381 {
    }
 }

-
 #[cfg(test)]
 mod tests {
-    use ark_bls12_381::{Fr as Fr_BLS12_381};

-    use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}, curves::bls12_381::{Point_BLS12_381, ScalarField_BLS12_381}};
+    use crate::curves::bls12_381::{Point_BLS12_381, ScalarField_BLS12_381};

    #[test]
    fn test_ark_scalar_convert() {
@@ -349,4 +404,4 @@ mod tests {
        );
        assert!(left != right);
    }
-}
+}
--- a/src/curves/bn254.rs
+++ b/src/curves/bn254.rs
@@ -1,15 +1,12 @@
-use std::ffi::c_uint;
-
-use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
-
+use crate::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
+use ark_bn254::{Fq as Fq_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
 use ark_ec::AffineCurve;
-use ark_ff::{BigInteger256, PrimeField};
-use std::mem::transmute;
 use ark_ff::Field;
-use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}};
-
+use ark_ff::{BigInteger256, PrimeField};
 use rustacuda_core::DeviceCopy;
 use rustacuda_derive::DeviceCopy;
+use std::ffi::c_uint;
+use std::mem::transmute;

 #[derive(Debug, PartialEq, Copy, Clone)]
 #[repr(C)]
@@ -27,9 +24,7 @@ impl<const NUM_LIMBS: usize> Default for Field_BN254<NUM_LIMBS> {

 impl<const NUM_LIMBS: usize> Field_BN254<NUM_LIMBS> {
    pub fn zero() -> Self {
-        Field_BN254 {
-            s: [0u32; NUM_LIMBS],
-        }
+        Field_BN254 { s: [0u32; NUM_LIMBS] }
    }

    pub fn one() -> Self {
@@ -41,7 +36,10 @@ impl<const NUM_LIMBS: usize> Field_BN254<NUM_LIMBS> {
    fn to_bytes_le(&self) -> Vec<u8> {
        self.s
            .iter()
-            .map(|s| s.to_le_bytes().to_vec())
+            .map(|s| {
+                s.to_le_bytes()
+                    .to_vec()
+            })
            .flatten()
            .collect::<Vec<_>>()
    }
@@ -50,7 +48,9 @@ impl<const NUM_LIMBS: usize> Field_BN254<NUM_LIMBS> {
 pub const BASE_LIMBS_BN254: usize = 8;
 pub const SCALAR_LIMBS_BN254: usize = 8;

+#[allow(non_camel_case_types)]
 pub type BaseField_BN254 = Field_BN254<BASE_LIMBS_BN254>;
+#[allow(non_camel_case_types)]
 pub type ScalarField_BN254 = Field_BN254<SCALAR_LIMBS_BN254>;

 fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
@@ -60,7 +60,9 @@ fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
            padded[..val.len()].copy_from_slice(&val);
            padded
        }
-        n if n == NUM_LIMBS => val.try_into().unwrap(),
+        n if n == NUM_LIMBS => val
+            .try_into()
+            .unwrap(),
        _ => panic!("slice has too many elements"),
    }
 }
@@ -71,7 +73,11 @@ impl ScalarField_BN254 {
    }

    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
+        BigInteger256::new(
+            u32_vec_to_u64_vec(&self.limbs())
+                .try_into()
+                .unwrap(),
+        )
    }

    pub fn from_ark(ark: BigInteger256) -> Self {
@@ -116,25 +122,41 @@ impl Point_BN254 {

    pub fn to_ark(&self) -> G1Projective_BN254 {
        //TODO: generic conversion
-        self.to_ark_affine().into_projective()
+        self.to_ark_affine()
+            .into_projective()
    }

    pub fn to_ark_affine(&self) -> G1Affine_BN254 {
        //TODO: generic conversion
-        use ark_ff::Field;
        use std::ops::Mul;
-        let proj_x_field = Fq_BN254::from_le_bytes_mod_order(&self.x.to_bytes_le());
-        let proj_y_field = Fq_BN254::from_le_bytes_mod_order(&self.y.to_bytes_le());
-        let proj_z_field = Fq_BN254::from_le_bytes_mod_order(&self.z.to_bytes_le());
-        let inverse_z = proj_z_field.inverse().unwrap();
+        let proj_x_field = Fq_BN254::from_le_bytes_mod_order(
+            &self
+                .x
+                .to_bytes_le(),
+        );
+        let proj_y_field = Fq_BN254::from_le_bytes_mod_order(
+            &self
+                .y
+                .to_bytes_le(),
+        );
+        let proj_z_field = Fq_BN254::from_le_bytes_mod_order(
+            &self
+                .z
+                .to_bytes_le(),
+        );
+        let inverse_z = proj_z_field
+            .inverse()
+            .unwrap();
        let aff_x = proj_x_field.mul(inverse_z);
        let aff_y = proj_y_field.mul(inverse_z);
        G1Affine_BN254::new(aff_x, aff_y, false)
    }

    pub fn from_ark(ark: G1Projective_BN254) -> Point_BN254 {
-        use ark_ff::Field;
-        let z_inv = ark.z.inverse().unwrap();
+        let z_inv = ark
+            .z
+            .inverse()
+            .unwrap();
        let z_invsq = z_inv * z_inv;
        let z_invq3 = z_invsq * z_inv;
        Point_BN254 {
@@ -176,17 +198,19 @@ impl PointAffineNoInfinity_BN254 {
    ///From u32 limbs x,y
    pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
        PointAffineNoInfinity_BN254 {
-            x: BaseField_BN254 {
-                s: get_fixed_limbs(x),
-            },
-            y: BaseField_BN254 {
-                s: get_fixed_limbs(y),
-            },
+            x: BaseField_BN254 { s: get_fixed_limbs(x) },
+            y: BaseField_BN254 { s: get_fixed_limbs(y) },
        }
    }

    pub fn limbs(&self) -> Vec<u32> {
-        [self.x.limbs(), self.y.limbs()].concat()
+        [
+            self.x
+                .limbs(),
+            self.y
+                .limbs(),
+        ]
+        .concat()
    }

    pub fn to_projective(&self) -> Point_BN254 {
@@ -198,13 +222,31 @@ impl PointAffineNoInfinity_BN254 {
    }

    pub fn to_ark(&self) -> G1Affine_BN254 {
-        G1Affine_BN254::new(Fq_BN254::new(self.x.to_ark()), Fq_BN254::new(self.y.to_ark()), false)
+        G1Affine_BN254::new(
+            Fq_BN254::new(
+                self.x
+                    .to_ark(),
+            ),
+            Fq_BN254::new(
+                self.y
+                    .to_ark(),
+            ),
+            false,
+        )
    }

    pub fn to_ark_repr(&self) -> G1Affine_BN254 {
        G1Affine_BN254::new(
-            Fq_BN254::from_repr(self.x.to_ark()).unwrap(),
-            Fq_BN254::from_repr(self.y.to_ark()).unwrap(),
+            Fq_BN254::from_repr(
+                self.x
+                    .to_ark(),
+            )
+            .unwrap(),
+            Fq_BN254::from_repr(
+                self.y
+                    .to_ark(),
+            )
+            .unwrap(),
            false,
        )
    }
@@ -222,15 +264,9 @@ impl Point_BN254 {

    pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
        Point_BN254 {
-            x: BaseField_BN254 {
-                s: get_fixed_limbs(x),
-            },
-            y: BaseField_BN254 {
-                s: get_fixed_limbs(y),
-            },
-            z: BaseField_BN254 {
-                s: get_fixed_limbs(z),
-            },
+            x: BaseField_BN254 { s: get_fixed_limbs(x) },
+            y: BaseField_BN254 { s: get_fixed_limbs(y) },
+            z: BaseField_BN254 { s: get_fixed_limbs(z) },
        }
    }

@@ -239,13 +275,19 @@ impl Point_BN254 {
        assert_eq!(l, 3 * BASE_LIMBS_BN254, "length must be 3 * {}", BASE_LIMBS_BN254);
        Point_BN254 {
            x: BaseField_BN254 {
-                s: value[..BASE_LIMBS_BN254].try_into().unwrap(),
+                s: value[..BASE_LIMBS_BN254]
+                    .try_into()
+                    .unwrap(),
            },
            y: BaseField_BN254 {
-                s: value[BASE_LIMBS_BN254..BASE_LIMBS_BN254 * 2].try_into().unwrap(),
+                s: value[BASE_LIMBS_BN254..BASE_LIMBS_BN254 * 2]
+                    .try_into()
+                    .unwrap(),
            },
            z: BaseField_BN254 {
-                s: value[BASE_LIMBS_BN254 * 2..].try_into().unwrap(),
+                s: value[BASE_LIMBS_BN254 * 2..]
+                    .try_into()
+                    .unwrap(),
            },
        }
    }
@@ -253,16 +295,21 @@ impl Point_BN254 {
    pub fn to_affine(&self) -> PointAffineNoInfinity_BN254 {
        let ark_affine = self.to_ark_affine();
        PointAffineNoInfinity_BN254 {
-            x: BaseField_BN254::from_ark(ark_affine.x.into_repr()),
-            y: BaseField_BN254::from_ark(ark_affine.y.into_repr()),
+            x: BaseField_BN254::from_ark(
+                ark_affine
+                    .x
+                    .into_repr(),
+            ),
+            y: BaseField_BN254::from_ark(
+                ark_affine
+                    .y
+                    .into_repr(),
+            ),
        }
    }

    pub fn to_xy_strip_z(&self) -> PointAffineNoInfinity_BN254 {
-        PointAffineNoInfinity_BN254 {
-            x: self.x,
-            y: self.y,
-        }
+        PointAffineNoInfinity_BN254 { x: self.x, y: self.y }
    }
 }

@@ -274,12 +321,10 @@ impl ScalarField_BN254 {
    }
 }

-
 #[cfg(test)]
 mod tests {
-    use ark_bn254::{Fr as Fr_BN254};

-    use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}, curves::bn254::{Point_BN254, ScalarField_BN254}};
+    use crate::curves::bn254::{Point_BN254, ScalarField_BN254};

    #[test]
    fn test_ark_scalar_convert() {
@@ -302,11 +347,7 @@ mod tests {
        assert_eq!(left, right);
        let right = Point_BN254::from_limbs(&[0; 8], &[2, 0, 0, 0, 0, 0, 0, 0], &[0; 8]);
        assert_eq!(left, right);
-        let right = Point_BN254::from_limbs(
-            &[2, 0, 0, 0, 0, 0, 0, 0],
-            &[0; 8],
-            &[1, 0, 0, 0, 0, 0, 0, 0],
-        );
+        let right = Point_BN254::from_limbs(&[2, 0, 0, 0, 0, 0, 0, 0], &[0; 8], &[1, 0, 0, 0, 0, 0, 0, 0]);
        assert!(left != right);
    }
-}
+}
--- a/src/curves/mod.rs
+++ b/src/curves/mod.rs
@@ -1,3 +1,3 @@
-pub mod bls12_381;
 pub mod bls12_377;
+pub mod bls12_381;
 pub mod bn254;
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
+pub mod curves;
 pub mod test_bls12_377;
 pub mod test_bls12_381;
 pub mod test_bn254;
 pub mod utils;
-pub mod curves;
--- a/src/test_bls12_377.rs
+++ b/src/test_bls12_377.rs
--- a/src/test_bls12_381.rs
+++ b/src/test_bls12_381.rs
--- a/src/test_bn254.rs
+++ b/src/test_bn254.rs
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,5 +1,5 @@
-use rand::RngCore;
 use rand::rngs::StdRng;
+use rand::RngCore;
 use rand::SeedableRng;

 pub fn from_limbs<T>(limbs: Vec<u32>, chunk_size: usize, f: fn(&[u32]) -> T) -> Vec<T> {
@@ -33,7 +33,8 @@ pub fn u64_vec_to_u32_vec(arr_u64: &[u64]) -> Vec<u32> {
    arr_u32
 }

-pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> { //TOOD: this func is universal
+pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
+    //TODO: this func is universal
    let rng: Box<dyn RngCore> = match seed {
        Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
        None => Box::new(rand::thread_rng()),
@@ -45,7 +46,7 @@ pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> { //TOOD: this func is uni
 mod tests {
    use ark_ff::BigInteger256;

-    use crate::curves::bls12_381::{ScalarField_BLS12_381 as ScalarField};
+    use crate::curves::bls12_381::ScalarField_BLS12_381 as ScalarField;

    use super::*;

@@ -54,7 +55,9 @@ mod tests {
        let arr_u32 = [1, 0x0fffffff, 3, 0x2fffffff, 5, 0x4fffffff, 7, 0x6fffffff];

        let s = ScalarField::from_ark_transmute(BigInteger256::new(
-            u32_vec_to_u64_vec(&arr_u32).try_into().unwrap(),
+            u32_vec_to_u64_vec(&arr_u32)
+                .try_into()
+                .unwrap(),
        ))
        .limbs();