Specify curve as bn254

integration with V2
Merge branch 'V2' into sumcheck-experiments
2026-01-11 00:18:08 -05:00 · 2024-08-13 17:59:48 +09:00 · 2024-04-17 11:19:38 +03:00 · 2024-04-15 17:43:10 +03:00 · 2024-04-14 16:31:49 +03:00 · 2024-04-14 18:23:43 +07:00
331 changed files with 276705 additions and 3443 deletions
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -4,7 +4,7 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2

 jobs:
  spelling-checker:
--- a/.github/workflows/cpp_cuda.yml
+++ b/.github/workflows/cpp_cuda.yml
@@ -4,11 +4,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -29,7 +29,7 @@ jobs:
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi

-  test-linux:
+  test-linux-curve:
    name: Test on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: [check-changed-files, check-format]
@@ -39,14 +39,36 @@ jobs:
    steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
-    - name: Build
+    - name: Build curve
      working-directory: ./icicle
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: |
-        mkdir -p build
-        cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -DG2_DEFINED=ON -S . -B build
-        cmake --build build
-    - name: Run C++ Tests
-      working-directory: ./icicle/build
+        mkdir -p build && rm -rf build/*
+        cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve }} -DG2=ON -S . -B build
+        cmake --build build -j
+    - name: Run C++ curve Tests
+      working-directory: ./icicle/build/tests
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: ctest
+
+  test-linux-field:
+    name: Test on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]
+    strategy:
+      matrix:
+        field: [babybear]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Build field
+      working-directory: ./icicle
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        mkdir -p build && rm -rf build/*
+        cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field }} -DEXT_FIELD=ON -S . -B build
+        cmake --build build -j
+    - name: Run C++ field Tests
+      working-directory: ./icicle/build/tests
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ctest
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -11,11 +11,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
--- a/.github/workflows/golang.yml
+++ b/.github/workflows/golang.yml
@@ -4,11 +4,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -50,13 +50,15 @@ jobs:
    - name: Build
      working-directory: ./wrappers/golang
      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: ./build.sh ${{ matrix.curve }} ON ON # builds a single curve with G2 and ECNTT enabled
+      run: ./build.sh ${{ matrix.curve }} -g2 -ecntt # builds a single curve with G2 and ECNTT enabled
    - name: Upload ICICLE lib artifacts
      uses: actions/upload-artifact@v4
      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      with:
        name: icicle-builds-${{ matrix.curve }}-${{ github.workflow }}-${{ github.sha }}
-        path: icicle/build/libingo_${{ matrix.curve }}.a
+        path: |
+          icicle/build/src/curves/libingo_curve_${{ matrix.curve }}.a
+          icicle/build/src/fields/libingo_field_${{ matrix.curve }}.a
        retention-days: 1
  
  test-linux:
@@ -74,7 +76,7 @@ jobs:
      uses: actions/download-artifact@v4
      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      with:
-        path: ./icicle/build/
+        path: ./icicle/build/src
        merge-multiple: true
    - name: Run Tests
      working-directory: ./wrappers/golang
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -4,11 +4,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -60,7 +60,11 @@ jobs:
      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      # Running tests from the root workspace will run all workspace members' tests by default
      # We need to limit the number of threads to avoid running out of memory on weaker machines
-      run: cargo test --release --verbose --features=g2 -- --test-threads=2
+      run: cargo test --workspace --exclude icicle-babybear --release --verbose --features=g2 -- --test-threads=2
+    - name: Run baby bear tests
+      working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: cargo test --release --verbose

  build-windows:
    name: Build on Windows
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,6 @@
 **/Cargo.lock
 **/icicle/build/
 **/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
-**/build
+**/build*
 **/icicle/appUtils/large_ntt/work
 icicle/appUtils/large_ntt/work/test_ntt
--- a/docs/docs/icicle/polynomials/ffi.uml
+++ b/docs/docs/icicle/polynomials/ffi.uml
@@ -0,0 +1,27 @@
+@startuml
+skinparam componentStyle uml2
+
+' Define Components
+component "C++ Template\nComponent" as CppTemplate {
+  [Parameterizable Interface]
+}
+component "C API Wrapper\nComponent" as CApiWrapper {
+  [C API Interface]
+}
+component "Rust Code\nComponent" as RustCode {
+  [Macro Interface\n(Template Instantiation)]
+}
+
+' Define Artifact
+artifact "Static Library\n«artifact»" as StaticLib
+
+' Connections
+CppTemplate -down-> CApiWrapper : Instantiates
+CApiWrapper .down.> StaticLib : Compiles into
+RustCode -left-> StaticLib : Links against\nand calls via FFI
+
+' Notes
+note right of CppTemplate : Generic C++\ntemplate implementation
+note right of CApiWrapper : Exposes C API for FFI\nto Rust/Go
+note right of RustCode : Uses macros to\ninstantiate templates
+@enduml
--- a/docs/docs/icicle/polynomials/hw_backends.uml
+++ b/docs/docs/icicle/polynomials/hw_backends.uml
@@ -0,0 +1,86 @@
+@startuml
+
+' Define Interface for Polynomial Backend Operations
+interface IPolynomialBackend {
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Define Interface for Polynomial Context (State Management)
+interface IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+' PolynomialAPI now uses two strategies: Backend and Context
+class PolynomialAPI {
+    -backendStrategy: IPolynomialBackend
+    -contextStrategy: IPolynomialContext
+    -setBackendStrategy(IPolynomialBackend)
+    -setContextStrategy(IPolynomialContext)
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Backend Implementations
+class GPUPolynomialBackend implements IPolynomialBackend {
+    #gpuResources: Resource
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+class ZPUPolynomialBackend implements IPolynomialBackend {
+    #zpuResources: Resource
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+class TracerPolynomialBackend implements IPolynomialBackend {
+    #traceData: Data
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Context Implementations (Placeholder for actual implementation)
+class GPUContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+class ZPUContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+class TracerContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+' Relationships
+PolynomialAPI o-- IPolynomialBackend : uses
+PolynomialAPI o-- IPolynomialContext : uses
+@enduml
--- a/examples/c++/polynomial_multiplication/example.cu
+++ b/examples/c++/polynomial_multiplication/example.cu
@@ -89,7 +89,6 @@ int main(int argc, char** argv)
        true,  // is_a_on_device
        true,  // is_b_on_device
        true,  // is_result_on_device
-        false, // is_montgomery
        false  // is_async
      };
      CHK_IF_RETURN(vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
--- a/examples/rust/msm/Cargo.toml
+++ b/examples/rust/msm/Cargo.toml
@@ -8,12 +8,11 @@ icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
 icicle-core = { path = "../../../wrappers/rust/icicle-core" }
 icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["g2"] }
 icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377" }
-ark-bn254 = { version = "0.4.0", optional = true}
-ark-bls12-377 = { version = "0.4.0", optional = true}
-ark-ec = { version = "0.4.0", optional = true}
+ark-bn254 = { version = "0.4.0", optional = true }
+ark-bls12-377 = { version = "0.4.0", optional = true }
+ark-ec = { version = "0.4.0", optional = true }
 clap = { version = "<=4.4.12", features = ["derive"] }

 [features]
 arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
 profile = []
-g2 = []
--- a/examples/rust/msm/src/main.rs
+++ b/examples/rust/msm/src/main.rs
@@ -4,7 +4,10 @@ use icicle_bls12_377::curve::{
    CurveCfg as BLS12377CurveCfg, G1Projective as BLS12377G1Projective, ScalarCfg as BLS12377ScalarCfg,
 };

-use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
+use icicle_cuda_runtime::{
+    memory::{DeviceVec, HostSlice},
+    stream::CudaStream,
+};

 use icicle_core::{curve::Curve, msm, traits::GenerateRandom};

@@ -57,18 +60,18 @@ fn main() {
            log_size, size
        );
        // Setting Bn254 points and scalars
-        let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
-        let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
-        let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
+        let points = HostSlice::from_slice(&upper_points[..size]);
+        let g2_points = HostSlice::from_slice(&g2_upper_points[..size]);
+        let scalars = HostSlice::from_slice(&upper_scalars[..size]);

        // Setting bls12377 points and scalars
        // let points_bls12377 = &upper_points_bls12377[..size];
-        let points_bls12377 = HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec()); //  &upper_points_bls12377[..size];
-        let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());
+        let points_bls12377 = HostSlice::from_slice(&upper_points_bls12377[..size]); //  &upper_points_bls12377[..size];
+        let scalars_bls12377 = HostSlice::from_slice(&upper_scalars_bls12377[..size]);

        println!("Configuring bn254 MSM...");
-        let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-        let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
+        let mut g2_msm_results = DeviceVec::<G2Projective>::cuda_malloc(1).unwrap();
        let stream = CudaStream::create().unwrap();
        let g2_stream = CudaStream::create().unwrap();
        let mut cfg = msm::MSMConfig::default();
@@ -82,8 +85,7 @@ fn main() {
        g2_cfg.is_async = true;

        println!("Configuring bls12377 MSM...");
-        let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> =
-            HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let mut msm_results_bls12377 = DeviceVec::<BLS12377G1Projective>::cuda_malloc(1).unwrap();
        let stream_bls12377 = CudaStream::create().unwrap();
        let mut cfg_bls12377 = msm::MSMConfig::default();
        cfg_bls12377
@@ -94,7 +96,7 @@ fn main() {
        println!("Executing bn254 MSM on device...");
        #[cfg(feature = "profile")]
        let start = Instant::now();
-        msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+        msm::msm(scalars, points, &cfg, &mut msm_results[..]).unwrap();
        #[cfg(feature = "profile")]
        println!(
            "ICICLE BN254 MSM on size 2^{log_size} took: {} ms",
@@ -102,16 +104,16 @@ fn main() {
                .elapsed()
                .as_millis()
        );
-        msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
+        msm::msm(scalars, g2_points, &g2_cfg, &mut g2_msm_results[..]).unwrap();

        println!("Executing bls12377 MSM on device...");
        #[cfg(feature = "profile")]
        let start = Instant::now();
        msm::msm(
-            &scalars_bls12377,
-            &points_bls12377,
+            scalars_bls12377,
+            points_bls12377,
            &cfg_bls12377,
-            &mut msm_results_bls12377,
+            &mut msm_results_bls12377[..],
        )
        .unwrap();
        #[cfg(feature = "profile")]
@@ -134,10 +136,10 @@ fn main() {
            .synchronize()
            .unwrap();
        msm_results
-            .copy_to_host(&mut msm_host_result[..])
+            .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
            .unwrap();
        g2_msm_results
-            .copy_to_host(&mut g2_msm_host_result[..])
+            .copy_to_host(HostSlice::from_mut_slice(&mut g2_msm_host_result[..]))
            .unwrap();
        println!("bn254 result: {:#?}", msm_host_result);
        println!("G2 bn254 result: {:#?}", g2_msm_host_result);
@@ -146,7 +148,7 @@ fn main() {
            .synchronize()
            .unwrap();
        msm_results_bls12377
-            .copy_to_host(&mut msm_host_result_bls12377[..])
+            .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result_bls12377[..]))
            .unwrap();
        println!("bls12377 result: {:#?}", msm_host_result_bls12377);

@@ -154,23 +156,19 @@ fn main() {
        {
            println!("Checking against arkworks...");
            let ark_points: Vec<Bn254G1Affine> = points
-                .as_slice()
                .iter()
                .map(|&point| point.to_ark())
                .collect();
            let ark_scalars: Vec<Bn254Fr> = scalars
-                .as_slice()
                .iter()
                .map(|scalar| scalar.to_ark())
                .collect();

            let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377
-                .as_slice()
                .iter()
                .map(|point| point.to_ark())
                .collect();
            let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
-                .as_slice()
                .iter()
                .map(|scalar| scalar.to_ark())
                .collect();
--- a/examples/rust/ntt/src/main.rs
+++ b/examples/rust/ntt/src/main.rs
@@ -2,7 +2,11 @@ use icicle_bn254::curve::{ScalarCfg, ScalarField};

 use icicle_bls12_377::curve::{ScalarCfg as BLS12377ScalarCfg, ScalarField as BLS12377ScalarField};

-use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
+use icicle_cuda_runtime::{
+    device_context::DeviceContext,
+    memory::{DeviceVec, HostSlice},
+    stream::CudaStream,
+};

 use icicle_core::{
    ntt::{self, NTT},
@@ -41,14 +45,13 @@ fn main() {
    );
    // Setting Bn254 points and scalars
    println!("Generating random inputs on host for bn254...");
-    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
-    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
+    let scalars = ScalarCfg::generate_random(size);
+    let mut ntt_results = DeviceVec::<ScalarField>::cuda_malloc(size).unwrap();

    // Setting bls12377 points and scalars
    println!("Generating random inputs on host for bls12377...");
-    let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
-    let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> =
-        HostOrDeviceSlice::cuda_malloc(size).unwrap();
+    let scalars_bls12377 = BLS12377ScalarCfg::generate_random(size);
+    let mut ntt_results_bls12377 = DeviceVec::<BLS12377ScalarField>::cuda_malloc(size).unwrap();

    println!("Setting up bn254 Domain...");
    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
@@ -86,7 +89,13 @@ fn main() {
    println!("Executing bn254 NTT on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
-    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
+    ntt::ntt(
+        HostSlice::from_slice(&scalars),
+        ntt::NTTDir::kForward,
+        &cfg,
+        &mut ntt_results[..],
+    )
+    .unwrap();
    #[cfg(feature = "profile")]
    println!(
        "ICICLE BN254 NTT on size 2^{log_size} took: {} μs",
@@ -99,10 +108,10 @@ fn main() {
    #[cfg(feature = "profile")]
    let start = Instant::now();
    ntt::ntt(
-        &scalars_bls12377,
+        HostSlice::from_slice(&scalars_bls12377),
        ntt::NTTDir::kForward,
        &cfg_bls12377,
-        &mut ntt_results_bls12377,
+        &mut ntt_results_bls12377[..],
    )
    .unwrap();
    #[cfg(feature = "profile")]
@@ -119,7 +128,7 @@ fn main() {
        .unwrap();
    let mut host_bn254_results = vec![ScalarField::zero(); size];
    ntt_results
-        .copy_to_host(&mut host_bn254_results[..])
+        .copy_to_host(HostSlice::from_mut_slice(&mut host_bn254_results[..]))
        .unwrap();

    stream_bls12377
@@ -127,19 +136,17 @@ fn main() {
        .unwrap();
    let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
    ntt_results_bls12377
-        .copy_to_host(&mut host_bls12377_results[..])
+        .copy_to_host(HostSlice::from_mut_slice(&mut host_bls12377_results[..]))
        .unwrap();

    println!("Checking against arkworks...");
    let mut ark_scalars: Vec<Bn254Fr> = scalars
-        .as_slice()
        .iter()
        .map(|scalar| scalar.to_ark())
        .collect();
    let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();

    let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
-        .as_slice()
        .iter()
        .map(|scalar| scalar.to_ark())
        .collect();
--- a/examples/rust/poseidon/src/main.rs
+++ b/examples/rust/poseidon/src/main.rs
@@ -4,7 +4,7 @@ use icicle_cuda_runtime::device_context::DeviceContext;

 use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
 use icicle_core::traits::FieldImpl;
-use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+use icicle_cuda_runtime::memory::HostSlice;

 #[cfg(feature = "profile")]
 use std::time::Instant;
@@ -25,23 +25,29 @@ fn main() {

    println!("Running Icicle Examples: Rust Poseidon Hash");
    let arity = 2u32;
-    println!("---------------------- Loading optimized Poseidon constants for arity={} ------------------------", arity);
+    println!(
+        "---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
+        arity
+    );
    let ctx = DeviceContext::default();
    let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
    let config = PoseidonConfig::default();

-    println!("---------------------- Input size 2^{}={} ------------------------", size, test_size);
-    let inputs = vec![F::one(); test_size * arity as usize];
-    let outputs = vec![F::zero(); test_size];
-    let mut input_slice = HostOrDeviceSlice::on_host(inputs);
-    let mut output_slice = HostOrDeviceSlice::on_host(outputs);
+    println!(
+        "---------------------- Input size 2^{}={} ------------------------",
+        size, test_size
+    );
+    let mut inputs = vec![F::one(); test_size * arity as usize];
+    let mut outputs = vec![F::zero(); test_size];
+    let input_slice = HostSlice::from_mut_slice(&mut inputs);
+    let output_slice = HostSlice::from_mut_slice(&mut outputs);

    println!("Executing BLS12-381 Poseidon Hash on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
    poseidon_hash_many::<F>(
-        &mut input_slice,
-        &mut output_slice,
+        input_slice,
+        output_slice,
        test_size as u32,
        arity as u32,
        &constants,
@@ -49,5 +55,10 @@ fn main() {
    )
    .unwrap();
    #[cfg(feature = "profile")]
-    println!("ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs", start.elapsed().as_micros());
-}
+    println!(
+        "ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs",
+        start
+            .elapsed()
+            .as_micros()
+    );
+}
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -1,169 +1,59 @@
 cmake_minimum_required(VERSION 3.18)

-# GoogleTest requires at least C++14
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-
-if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
-  message(WARNING "Note that PIC (position-independent code) is disabled.")
-else()
-  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-endif()
-
-# add the target cuda architectures
-# each additional architecture increases the compilation time and output file size
-if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
-  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
-  find_program(_nvidia_smi "nvidia-smi")
-
-  if(_nvidia_smi)
-    set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
-
-    # execute nvidia-smi -L to get a short list of GPUs available
-    exec_program(${_nvidia_smi_path} ARGS -L
-      OUTPUT_VARIABLE _nvidia_smi_out
-      RETURN_VALUE _nvidia_smi_ret)
-
-    # process the stdout of nvidia-smi
-    if(_nvidia_smi_ret EQUAL 0)
-      # convert string with newlines to list of strings
-      string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
-
-      foreach(_line ${_nvidia_smi_out})
-        if(_line MATCHES "^GPU [0-9]+:")
-          math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
-
-          # the UUID is not very useful for the user, remove it
-          string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
-
-          if(NOT _gpu_info STREQUAL "")
-            list(APPEND DETECT_GPU_INFO "${_gpu_info}")
-          endif()
-        endif()
-      endforeach()
-
-      check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
-      set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
-    endif()
-  endif()
-
-  # ##
-  if(DETECT_GPU_COUNT GREATER 0)
-    set(CMAKE_CUDA_ARCHITECTURES native) # do native
-  else()
-    # no GPUs found, like on Github CI runners
-    set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
-  endif()
-endif()
-
 project(icicle LANGUAGES CUDA CXX)
-# Check CUDA version and, if possible, enable multi-threaded compilation 
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
-    message(STATUS "Using multi-threaded CUDA compilation.")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0")
-else()
-    message(STATUS "Can't use multi-threaded CUDA compilation.")
-endif()
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo")
-include_directories("${CMAKE_SOURCE_DIR}")

+include(cmake/Common.cmake)
+include(cmake/FieldsCommon.cmake)
+include(cmake/CurvesCommon.cmake)

-# when adding a new curve/field, append its name to the end of this list
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
-set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761;grumpkin)
-SET(SUPPORTED_CURVES_WITHOUT_NTT grumpkin)
+set_env()
+set_gpu_env()

-set(IS_CURVE_SUPPORTED FALSE)
-set(I 0)
-foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
-  math(EXPR I "${I} + 1")
-  if (CURVE STREQUAL SUPPORTED_CURVE)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I}")
-    set(IS_CURVE_SUPPORTED TRUE)
-  endif ()
-endforeach()
+option(DEVMODE "Enable development mode" OFF)
+option(EXT_FIELD "Build extension field" OFF)
+option(G2 "Build G2" OFF)
+option(ECNTT "Build ECNTT" OFF)
+option(BUILD_HASH "Build hash functions" OFF)
+option(BUILD_TESTS "Build unit tests" OFF)
+option(BUILD_BENCHMARKS "Build benchmarks" OFF)
+# add options here

-if (NOT IS_CURVE_SUPPORTED)
-  message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
+if((DEFINED CURVE) AND (DEFINED FIELD))
+  message( FATAL_ERROR "CURVE and FIELD cannot be defined at the same time" )
 endif ()

-if (DEVMODE STREQUAL "ON")
+if (DEVMODE)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O0 --ptxas-options=-O0 --ptxas-options=-allow-expensive-optimizations=false -DDEVMODE=ON")
 endif ()

-if (G2_DEFINED STREQUAL "ON")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
+if(DEFINED FIELD)
+  check_field()
+  add_subdirectory(src/fields)
 endif ()

-if (ECNTT_DEFINED STREQUAL "ON")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DECNTT_DEFINED=ON")
+if(DEFINED CURVE)
+  check_curve()
+  set(FIELD ${CURVE})
+  add_subdirectory(src/fields)
+  add_subdirectory(src/curves)
 endif ()

-option(BUILD_TESTS "Build tests" OFF)
-
-if (NOT BUILD_TESTS)
-
-  message(STATUS "Building without tests.")
-
-  if (CURVE IN_LIST SUPPORTED_CURVES_WITH_POSEIDON)
-    list(APPEND ICICLE_SOURCES appUtils/poseidon/poseidon.cu)
-    list(APPEND ICICLE_SOURCES appUtils/tree/merkle.cu)
-  endif()
-
-  if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT)
-      list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
-      list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
-      if(ECNTT_DEFINED STREQUAL "ON")
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DECNTT_DEFINED=ON")
-      endif()
-  endif()
-
-  add_library(
-    icicle
-    utils/vec_ops.cu
-    utils/mont.cu
-    primitives/field.cu
-    primitives/projective.cu
-    appUtils/msm/msm.cu
-    ${ICICLE_SOURCES}
-  )
-  set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
-  target_compile_definitions(icicle PRIVATE CURVE=${CURVE})  
-
-else()
-
-  message(STATUS "Building tests.")
-
-  include(FetchContent)
-  FetchContent_Declare(
-    googletest
-    URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
-  )
-  # For Windows: Prevent overriding the parent project's compiler/linker settings
-
-  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-  FetchContent_MakeAvailable(googletest)
-
-  enable_testing()
-
-  add_executable(
-    runner
-    tests/runner.cu
-  )
-
-  target_link_libraries(
-    runner
-    GTest::gtest_main
-  )
-
-  include(GoogleTest)
-  set_target_properties(runner PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-  gtest_discover_tests(runner)
-
+if (G2)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
 endif ()
+
+if (EXT_FIELD)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
+endif ()
+
+if(BUILD_HASH)
+  add_subdirectory(src/hash)
+endif ()
+
+if (BUILD_TESTS)
+  add_subdirectory(tests)
+endif()
+
+if (BUILD_BENCHMARKS)
+  add_subdirectory(benchmarks)
+endif()
--- a/icicle/benchmarks/CMakeLists.txt
+++ b/icicle/benchmarks/CMakeLists.txt
@@ -0,0 +1,5 @@
+
+add_executable(benches benches.cu)
+target_link_libraries(benches benchmark::benchmark)
+target_include_directories(benches PUBLIC ${CMAKE_SOURCE_DIR}/include/)
+find_package(benchmark REQUIRED)
--- a/icicle/benchmarks/README.md
+++ b/icicle/benchmarks/README.md
@@ -0,0 +1,25 @@
+# How to use benchmarks
+
+ICICLE uses [google benchmarks](https://github.com/google/benchmark) to measure the performance of primitives.
+
+To run benchmarks, make sure you have everything installed to run ICICLE (see top-level README for that). Next, you need to install google benchmarks library as described in their [documentation](https://github.com/google/benchmark?tab=readme-ov-file#installation). When running benchmarks, export the path to this installation:
+
+```
+export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:<path-to-google-benchmarks-build-folder>
+```
+
+Then to benchmark field arithmetic, say, on `baby_bear` field, run:
+
+```
+cmake -UCURVE -UFIELD -UG2 -UEXT_FIELD -DFIELD=babybear -DEXT_FIELD=ON -S . -B build;
+cmake --build build;
+build/benches --benchmark_counters_tabular=true
+```
+
+`-U` parameters are needed to clear variables from previous runs and `EXT_FIELD` can be disabled if benhcmarking the extension field is not needed. To benchmark a curve, say, `bn254`, change the first `cmake` call to:
+
+```
+cmake -UCURVE -UFIELD -UG2 -UEXT_FIELD -DCURVE=bn254 -S . -B build;
+```
+
+Benchmarks measure throughput of very cheap operations like field multiplication or EC addition by repeating them very many times in parallel, so throughput is the main metric to look at.
--- a/icicle/benchmarks/benches.cu
+++ b/icicle/benchmarks/benches.cu
@@ -0,0 +1,6 @@
+#include "field_benchmarks.cu"
+#ifdef CURVE_ID
+#include "curve_benchmarks.cu"
+#endif
+
+BENCHMARK_MAIN();
--- a/icicle/benchmarks/curve_benchmarks.cu
+++ b/icicle/benchmarks/curve_benchmarks.cu
@@ -0,0 +1,79 @@
+#include <benchmark/benchmark.h>
+#include "utils/test_functions.cuh"
+#include "curves/curve_config.cuh"
+
+using namespace curve_config;
+using namespace benchmark;
+
+static void BM_MixedECAdd(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  projective_t* points1;
+  affine_t* points2;
+  assert(!cudaMalloc(&points1, n * sizeof(projective_t)));
+  assert(!cudaMalloc(&points2, n * sizeof(affine_t)));
+
+  projective_t* h_points1 = (projective_t*)malloc(n * sizeof(projective_t));
+  affine_t* h_points2 = (affine_t*)malloc(n * sizeof(affine_t));
+  projective_t::RandHostMany(h_points1, n);
+  projective_t::RandHostManyAffine(h_points2, n);
+  cudaMemcpy(points1, h_points1, sizeof(projective_t) * n, cudaMemcpyHostToDevice);
+  cudaMemcpy(points2, h_points2, sizeof(affine_t) * n, cudaMemcpyHostToDevice);
+
+  for (auto _ : state) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+    assert((vec_add<projective_t, affine_t, N>(points1, points2, points1, n)) == cudaSuccess);
+    assert(cudaStreamSynchronize(0) == cudaSuccess);
+    cudaEventRecord(stop);
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(points1);
+  cudaFree(points2);
+}
+
+static void BM_FullECAdd(benchmark::State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  projective_t* points1;
+  projective_t* points2;
+  assert(!cudaMalloc(&points1, n * sizeof(projective_t)));
+  assert(!cudaMalloc(&points2, n * sizeof(projective_t)));
+
+  projective_t* h_points1 = (projective_t*)malloc(n * sizeof(projective_t));
+  projective_t* h_points2 = (projective_t*)malloc(n * sizeof(projective_t));
+  projective_t::RandHostMany(h_points1, n);
+  projective_t::RandHostMany(h_points2, n);
+  cudaMemcpy(points1, h_points1, sizeof(projective_t) * n, cudaMemcpyHostToDevice);
+  cudaMemcpy(points2, h_points2, sizeof(projective_t) * n, cudaMemcpyHostToDevice);
+
+  for (auto _ : state) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+    assert((vec_add<projective_t, projective_t, N>(points1, points2, points1, n)) == cudaSuccess);
+    assert(cudaStreamSynchronize(0) == cudaSuccess);
+    cudaEventRecord(stop);
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(points1);
+  cudaFree(points2);
+}
+
+BENCHMARK(BM_FullECAdd)->Range(1 << 27, 1 << 27)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_MixedECAdd)->Range(1 << 27, 1 << 27)->Unit(benchmark::kMillisecond);
--- a/icicle/benchmarks/field_benchmarks.cu
+++ b/icicle/benchmarks/field_benchmarks.cu
@@ -0,0 +1,108 @@
+#include <benchmark/benchmark.h>
+#include "utils/test_functions.cuh"
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+using namespace benchmark;
+
+template <class T>
+static void BM_FieldAdd(State& state)
+{
+  constexpr int N = 256;
+  int n = state.range(0) / N;
+  T* scalars1;
+  T* scalars2;
+  assert(!cudaMalloc(&scalars1, n * sizeof(T)));
+  assert(!cudaMalloc(&scalars2, n * sizeof(T)));
+
+  assert(device_populate_random<T>(scalars1, n) == cudaSuccess);
+  assert(device_populate_random<T>(scalars2, n) == cudaSuccess);
+
+  for (auto _ : state) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+    assert((vec_add<T, T, N>(scalars1, scalars2, scalars1, n)) == cudaSuccess);
+    assert(cudaStreamSynchronize(0) == cudaSuccess);
+    cudaEventRecord(stop);
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(scalars1);
+  cudaFree(scalars2);
+}
+
+template <class T>
+static void BM_FieldMul(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  T* scalars1;
+  T* scalars2;
+  assert(!cudaMalloc(&scalars1, n * sizeof(T)));
+  assert(!cudaMalloc(&scalars2, n * sizeof(T)));
+
+  assert(device_populate_random<T>(scalars1, n) == cudaSuccess);
+  assert(device_populate_random<T>(scalars2, n) == cudaSuccess);
+
+  for (auto _ : state) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+    assert((vec_mul<T, T, N>(scalars1, scalars2, scalars1, n)) == cudaSuccess);
+    assert(cudaStreamSynchronize(0) == cudaSuccess);
+    cudaEventRecord(stop);
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(scalars1);
+  cudaFree(scalars2);
+}
+
+template <class T>
+static void BM_FieldSqr(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  T* scalars;
+  assert(!cudaMalloc(&scalars, n * sizeof(T)));
+
+  assert(device_populate_random<T>(scalars, n) == cudaSuccess);
+
+  for (auto _ : state) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+    assert((field_vec_sqr<T, N>(scalars, scalars, n)) == cudaSuccess);
+    assert(cudaStreamSynchronize(0) == cudaSuccess);
+    cudaEventRecord(stop);
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(scalars);
+}
+
+BENCHMARK(BM_FieldAdd<scalar_t>)->Range(1 << 28, 1 << 28)->Unit(kMicrosecond);
+BENCHMARK(BM_FieldMul<scalar_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond);
+BENCHMARK(BM_FieldSqr<scalar_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond);
+
+#ifdef EXT_FIELD
+BENCHMARK(BM_FieldAdd<extension_t>)->Range(1 << 28, 1 << 28)->Unit(kMicrosecond);
+BENCHMARK(BM_FieldMul<extension_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond);
+BENCHMARK(BM_FieldSqr<extension_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond);
+#endif
--- a/icicle/cmake/Common.cmake
+++ b/icicle/cmake/Common.cmake
@@ -0,0 +1,72 @@
+function(set_env)
+    set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
+    set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE)
+    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE)
+    set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE)
+
+    if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
+        message(WARNING "Note that PIC (position-independent code) is disabled.")
+    else()
+        set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+    endif()
+endfunction()
+
+function(set_gpu_env)
+    # add the target cuda architectures
+    # each additional architecture increases the compilation time and output file size
+    if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
+    else()
+    find_program(_nvidia_smi "nvidia-smi")
+
+    if(_nvidia_smi)
+        set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
+
+        # execute nvidia-smi -L to get a short list of GPUs available
+        exec_program(${_nvidia_smi_path} ARGS -L
+        OUTPUT_VARIABLE _nvidia_smi_out
+        RETURN_VALUE _nvidia_smi_ret)
+
+        # process the stdout of nvidia-smi
+        if(_nvidia_smi_ret EQUAL 0)
+        # convert string with newlines to list of strings
+        string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
+
+        foreach(_line ${_nvidia_smi_out})
+            if(_line MATCHES "^GPU [0-9]+:")
+            math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
+
+            # the UUID is not very useful for the user, remove it
+            string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
+
+            if(NOT _gpu_info STREQUAL "")
+                list(APPEND DETECT_GPU_INFO "${_gpu_info}")
+            endif()
+            endif()
+        endforeach()
+
+        check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
+        set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
+        endif()
+    endif()
+
+    # ##
+    if(DETECT_GPU_COUNT GREATER 0)
+        set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE) # do native
+    else()
+        # no GPUs found, like on Github CI runners
+        set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
+    endif()
+    endif()
+
+    # Check CUDA version and, if possible, enable multi-threaded compilation 
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
+        message(STATUS "Using multi-threaded CUDA compilation.")
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0" PARENT_SCOPE)
+    else()
+        message(STATUS "Can't use multi-threaded CUDA compilation.")
+    endif()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE)
+    set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
+    set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
+endfunction()
--- a/icicle/cmake/CurvesCommon.cmake
+++ b/icicle/cmake/CurvesCommon.cmake
@@ -0,0 +1,17 @@
+function(check_curve)
+  set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
+
+  set(IS_CURVE_SUPPORTED FALSE)
+  set(I 0)
+  foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
+    math(EXPR I "${I} + 1")
+    if (CURVE STREQUAL SUPPORTED_CURVE)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I} -DFIELD_ID=${I}" PARENT_SCOPE)
+      set(IS_CURVE_SUPPORTED TRUE)
+    endif ()
+  endforeach()
+
+  if (NOT IS_CURVE_SUPPORTED)
+    message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
+  endif ()
+endfunction()
--- a/icicle/cmake/FieldsCommon.cmake
+++ b/icicle/cmake/FieldsCommon.cmake
@@ -0,0 +1,17 @@
+function(check_field)
+  set(SUPPORTED_FIELDS babybear)
+
+  set(IS_FIELD_SUPPORTED FALSE)
+  set(I 1000)
+  foreach (SUPPORTED_FIELD ${SUPPORTED_FIELDS})
+    math(EXPR I "${I} + 1")
+    if (FIELD STREQUAL SUPPORTED_FIELD)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DFIELD_ID=${I}" PARENT_SCOPE)
+      set(IS_FIELD_SUPPORTED TRUE)
+    endif ()
+  endforeach()
+
+  if (NOT IS_FIELD_SUPPORTED)
+    message( FATAL_ERROR "The value of FIELD variable: ${FIELD} is not one of the supported fields: ${SUPPORTED_FIELDS}" )
+  endif ()
+endfunction()
--- a/icicle/include/curves/affine.cuh
+++ b/icicle/include/curves/affine.cuh
@@ -1,6 +1,8 @@
 #pragma once

-#include "field.cuh"
+#include "gpu-utils/sharedmem.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include <iostream>

 template <class FF>
 class Affine
@@ -33,4 +35,13 @@ public:
    os << "x: " << point.x << "; y: " << point.y;
    return os;
  }
-};
+};
+
+template <class FF>
+struct SharedMemory<Affine<FF>> {
+  __device__ Affine<FF>* getPointer()
+  {
+    extern __shared__ Affine<FF> s_affine_[];
+    return s_affine_;
+  }
+};
--- a/icicle/include/curves/curve_config.cuh
+++ b/icicle/include/curves/curve_config.cuh
@@ -1,36 +1,38 @@
 #pragma once
-#ifndef INDEX_H
-#define INDEX_H
+#ifndef CURVE_CONFIG_H
+#define CURVE_CONFIG_H

-#define BN254     1
-#define BLS12_381 2
-#define BLS12_377 3
-#define BW6_761   4
-#define GRUMPKIN  5
-
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#if defined(G2_DEFINED)
-#include "primitives/extension_field.cuh"
-#endif
+#include "fields/id.h"
+#include "curves/projective.cuh"

 #if CURVE_ID == BN254
-#include "bn254_params.cuh"
+#include "curves/params/bn254.cuh"
 using namespace bn254;
+
 #elif CURVE_ID == BLS12_381
-#include "bls12_381_params.cuh"
+#include "curves/params/bls12_381.cuh"
 using namespace bls12_381;
+
 #elif CURVE_ID == BLS12_377
-#include "bls12_377_params.cuh"
+#include "curves/params/bls12_377.cuh"
 using namespace bls12_377;
+
 #elif CURVE_ID == BW6_761
-#include "bw6_761_params.cuh"
+#include "curves/params/bw6_761.cuh"
 using namespace bw6_761;
+
 #elif CURVE_ID == GRUMPKIN
-#include "grumpkin_params.cuh"
+#include "curves/params/grumpkin.cuh"
 using namespace grumpkin;
 #endif

+#include "fields/field_config.cuh"
+using field_config::scalar_t;
+
+#ifdef G2
+#include "fields/quadratic_extension.cuh"
+#endif
+
 /**
 * @namespace curve_config
 * Namespace with type definitions for short Weierstrass pairing-friendly [elliptic
@@ -38,14 +40,11 @@ using namespace grumpkin;
 * with the `-DCURVE` env variable passed during build.
 */
 namespace curve_config {
-  /**
-   * Scalar field of the curve. Is always a prime field.
-   */
-  typedef Field<fp_config> scalar_t;
  /**
   * Base field of G1 curve. Is always a prime field.
   */
  typedef Field<fq_config> point_field_t;
+
  static constexpr point_field_t generator_x = point_field_t{g1_gen_x};
  static constexpr point_field_t generator_y = point_field_t{g1_gen_y};
  static constexpr point_field_t b = point_field_t{weierstrass_b};
@@ -59,7 +58,7 @@ namespace curve_config {
   */
  typedef Affine<point_field_t> affine_t;

-#if defined(G2_DEFINED)
+#ifdef G2
 #if CURVE_ID == BW6_761
  typedef point_field_t g2_point_field_t;
  static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{g2_gen_x};
@@ -74,6 +73,7 @@ namespace curve_config {
  static constexpr g2_point_field_t g2_b =
    g2_point_field_t{point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}};
 #endif
+
  /**
   * [Projective representation](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) of G2 curve.
   */
--- a/icicle/include/curves/params/bls12_377.cuh
+++ b/icicle/include/curves/params/bls12_377.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#ifndef BLS12_377_PARAMS_H
+#define BLS12_377_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bls12_377_base.cuh"
+
+namespace bls12_377 {
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
+                                                               0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
+                                                               0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
+                                                               0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
+                                                               0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
+                                                                  0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
+                                                                  0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
+                                                                  0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
+                                                                  0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
+                                                                  0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
+                                                                  0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
+                                                                  0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
+                                                                  0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x9999999a, 0x1c9ed999, 0x1ccccccd, 0x0dd39e5c, 0x3c6bf800, 0x129207b6,
+    0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6};
+} // namespace bls12_377
+
+#endif
--- a/icicle/include/curves/params/bls12_381.cuh
+++ b/icicle/include/curves/params/bls12_381.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#ifndef BLS12_381_PARAMS_H
+#define BLS12_381_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bls12_381_base.cuh"
+
+namespace bls12_381 {
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
+                                                               0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
+                                                               0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
+                                                               0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
+                                                               0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
+                                                                  0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
+                                                                  0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
+                                                                  0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
+                                                                  0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
+                                                                  0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
+                                                                  0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
+                                                                  0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
+                                                                  0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+} // namespace bls12_381
+
+#endif
--- a/icicle/include/curves/params/bn254.cuh
+++ b/icicle/include/curves/params/bn254.cuh
@@ -0,0 +1,31 @@
+#pragma once
+#ifndef BN254_PARAMS_H
+#define BN254_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bn254_base.cuh"
+
+namespace bn254 {
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
+                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
+                                                                  0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
+                                                                  0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
+                                                                  0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
+                                                                  0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0};
+} // namespace bn254
+
+#endif
--- a/icicle/include/curves/params/bw6_761.cuh
+++ b/icicle/include/curves/params/bw6_761.cuh
@@ -0,0 +1,37 @@
+#pragma once
+#ifndef BW6_761_PARAMS_H
+#define BW6_761_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bw6_761_base.cuh"
+
+namespace bw6_761 {
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {
+    0x66e5b43d, 0x4088f3af, 0xa6af603f, 0x055928ac, 0x56133e82, 0x6750dd03, 0x280ca27f, 0x03758f9a,
+    0xc9ea0971, 0x5bd71fa0, 0x47729b90, 0xa17a54ce, 0x94c2e746, 0x11dbfcd2, 0xc15520ac, 0x79017ffa,
+    0x85f56fc7, 0xee05c54b, 0x551b27f0, 0xe6a0cfb7, 0xa477beae, 0xb277ce98, 0x0ea190c8, 0x01075b02};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {
+    0xb4e95363, 0xbafc8f2d, 0x0b20d2a1, 0xad1cb2be, 0xcad0fb93, 0xb2b08119, 0xb3053253, 0x9f9df141,
+    0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1,
+    0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x = {
+    0xcd025f1c, 0xa830c194, 0xe1bf995b, 0x6410cf4f, 0xc2ad54b0, 0x00e96efb, 0x3cd208d7, 0xce6948cb,
+    0x00e1b6ba, 0x963317a3, 0xac70e7c7, 0xc5bbcae9, 0xf09feb58, 0x734ec3f1, 0xab3da268, 0x26b41c5d,
+    0x13890f6d, 0x4c062010, 0xc5a7115f, 0xd61053aa, 0x69d660f9, 0xc852a82e, 0x41d9b816, 0x01101332};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y = {
+    0x28c73b61, 0xeb70a167, 0xf9eac689, 0x91ec0594, 0x3c5a02a5, 0x58aa2d3a, 0x504affc7, 0x3ea96fcd,
+    0xffa82300, 0x8906c170, 0xd2c712b8, 0x64f293db, 0x33293fef, 0x94c97eb7, 0x0b95a59c, 0x0a1d86c8,
+    0x53ffe316, 0x81a78e27, 0xcec2181c, 0x26b7cf9a, 0xe4b6d2dc, 0x8179eb10, 0x7761369f, 0x0017c335};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {
+    0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
+    0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
+    0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
+  static constexpr storage<fq_config::limbs_count> g2_weierstrass_b = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+} // namespace bw6_761
+
+#endif
--- a/icicle/include/curves/params/grumpkin.cuh
+++ b/icicle/include/curves/params/grumpkin.cuh
@@ -2,13 +2,11 @@
 #ifndef GRUMPKIN_PARAMS_H
 #define GRUMPKIN_PARAMS_H

-#include "utils/storage.cuh"
-#include "bn254_params.cuh"
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bn254_scalar.cuh"

 namespace grumpkin {
-  typedef bn254::fq_config fp_config;
  typedef bn254::fp_config fq_config;
-
  // G1 generator
  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
--- a/icicle/include/curves/projective.cuh
+++ b/icicle/include/curves/projective.cuh
@@ -1,6 +1,7 @@
 #pragma once

 #include "affine.cuh"
+#include "gpu-utils/sharedmem.cuh"

 template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
 class Projective
@@ -8,6 +9,9 @@ class Projective
  friend Affine<FF>;

 public:
+  typedef Affine<FF> Aff;
+  typedef SCALAR_FF Scalar;
+
  static constexpr unsigned SCALAR_FF_NBITS = SCALAR_FF::NBITS;
  static constexpr unsigned FF_NBITS = FF::NBITS;

@@ -23,7 +27,10 @@ public:
    return {point.x * denom, point.y * denom};
  }

-  static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point) { return {point.x, point.y, FF::one()}; }
+  static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point)
+  {
+    return point == Affine<FF>::zero() ? zero() : Projective{point.x, point.y, FF::one()};
+  }

  static HOST_DEVICE_INLINE Projective ToMontgomery(const Projective& point)
  {
@@ -221,3 +228,12 @@ public:
      out[i] = (i % size < 100) ? to_affine(rand_host()) : out[i - 100];
  }
 };
+
+template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
+struct SharedMemory<Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>> {
+  __device__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
+  {
+    extern __shared__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> s_projective_[];
+    return s_projective_;
+  }
+};
--- a/icicle/include/fields/field.cuh
+++ b/icicle/include/fields/field.cuh
@@ -18,10 +18,13 @@

 #pragma once

-#include "utils/error_handler.cuh"
-#include "utils/host_math.cuh"
-#include "utils/ptx.cuh"
-#include "utils/storage.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "gpu-utils/sharedmem.cuh"
+#include "host_math.cuh"
+#include "ptx.cuh"
+#include "storage.cuh"
+
 #include <iomanip>
 #include <iostream>
 #include <random>
@@ -244,10 +247,10 @@ public:
  add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
  {
    r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
-    for (unsigned i = 1; i < (CARRY_OUT ? n : n - 1); i++)
+    for (unsigned i = 1; i < n; i++)
      r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]);
    if (!CARRY_OUT) {
-      r[n - 1] = SUBTRACT ? ptx::subc(x[n - 1], y[n - 1]) : ptx::addc(x[n - 1], y[n - 1]);
+      ptx::addc(0, 0);
      return 0;
    }
    return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0);
@@ -466,67 +469,84 @@ public:
   */
  static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
  {
-    const uint32_t* a = as.limbs;
-    const uint32_t* b = bs.limbs;
-    uint32_t* even = rs.limbs;
-    __align__(16) uint32_t odd[2 * TLC - 2];
+    if constexpr (TLC > 1) {
+      const uint32_t* a = as.limbs;
+      const uint32_t* b = bs.limbs;
+      uint32_t* even = rs.limbs;
+      __align__(16) uint32_t odd[2 * TLC - 2];

-    even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
-    odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
-    odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
-    size_t i;
-    UNROLL
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
-      mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
+      even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
+      odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
+      odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
+      size_t i;
+      UNROLL
+      for (i = 2; i < TLC - 1; i += 2) {
+        mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
+        mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
+      }
+      mad_row(&even[TLC], &odd[TLC - 2], a, b[TLC - 1]);
+
+      // merge |even| and |odd|
+      ptx::add_cc(even[TLC - 1], odd[TLC - 2]);
+      for (i = TLC - 1; i < 2 * TLC - 2; i++)
+        even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+      even[i + 1] = ptx::addc(even[i + 1], 0);
+    } else {
+      multiply_raw_device(as, bs, rs);
    }
-    mad_row(&even[TLC], &odd[TLC - 2], a, b[TLC - 1]);
-
-    // merge |even| and |odd|
-    ptx::add_cc(even[TLC - 1], odd[TLC - 2]);
-    for (i = TLC - 1; i < 2 * TLC - 2; i++)
-      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], 0);
  }

  /**
-   * A function that computes the low half of the fused multiply-and-add \f$ rs = as \cdot bs + cs \f$.
+   * A function that computes the low half of the fused multiply-and-add \f$ rs = as \cdot bs + cs \f$ where
+   * \f$ bs = 2^{32*nof_limbs} \f$.
   *
   * For efficiency, this method does not include terms that are too large. Namely, limb product \f$ a_i \cdot b_j \f$
   * is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
   * limb products are included.
   */
  static DEVICE_INLINE void
-  multiply_and_add_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
+  multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
  {
+    ff_storage bs = get_neg_modulus();
    const uint32_t* a = as.limbs;
    const uint32_t* b = bs.limbs;
+    uint32_t* c = cs.limbs;
    uint32_t* even = rs.limbs;
-    __align__(16) uint32_t odd[TLC - 1];
-    size_t i;
-    // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
-    // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
-    // less efficient SASS generated by nvcc, so this case needed separate handling.
-    if (b[0] == UINT32_MAX) {
-      add_sub_u32_device<true, false>(cs.limbs, a, even, TLC);
-      for (i = 0; i < TLC - 1; i++)
-        odd[i] = a[i];
-    } else {
-      mul_n_and_add(even, a, b[0], cs.limbs, TLC);
-      mul_n(odd, a + 1, b[0], TLC - 1);
-    }
-    mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
-    UNROLL
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
-      mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
-    }

-    // merge |even| and |odd|
-    even[1] = ptx::add_cc(even[1], odd[0]);
-    for (i = 1; i < TLC - 2; i++)
-      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], odd[i]);
+    if constexpr (TLC > 2) {
+      __align__(16) uint32_t odd[TLC - 1];
+      size_t i;
+      // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
+      // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
+      // less efficient SASS generated by nvcc, so this case needed separate handling.
+      if (b[0] == UINT32_MAX) {
+        add_sub_u32_device<true, false>(c, a, even, TLC);
+        for (i = 0; i < TLC - 1; i++)
+          odd[i] = a[i];
+      } else {
+        mul_n_and_add(even, a, b[0], c, TLC);
+        mul_n(odd, a + 1, b[0], TLC - 1);
+      }
+      mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
+      UNROLL
+      for (i = 2; i < TLC - 1; i += 2) {
+        mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
+        mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
+      }
+
+      // merge |even| and |odd|
+      even[1] = ptx::add_cc(even[1], odd[0]);
+      for (i = 1; i < TLC - 2; i++)
+        even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+      even[i + 1] = ptx::addc(even[i + 1], odd[i]);
+    } else if (TLC == 2) {
+      even[0] = ptx::mad_lo(a[0], b[0], c[0]);
+      even[1] = ptx::mad_hi(a[0], b[0], c[0]);
+      even[1] = ptx::mad_lo(a[0], b[1], even[1]);
+      even[1] = ptx::mad_lo(a[1], b[0], even[1]);
+    } else if (TLC == 1) {
+      even[0] = ptx::mad_lo(a[0], b[0], c[0]);
+    }
  }

  /**
@@ -599,29 +619,47 @@ public:
    const uint32_t* a = as.limbs;
    const uint32_t* b = bs.limbs;
    uint32_t* r = rs.limbs;
-    // Next two lines multiply high and low halves of operands (\f$ a_{lo} \cdot b_{lo}; a_{hi} \cdot b_{hi} \$f) and
-    // write the results into `r`.
-    multiply_short_raw_device(a, b, r);
-    multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
-    __align__(16) uint32_t middle_part[TLC];
-    __align__(16) uint32_t diffs[TLC];
-    // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to
-    // `carry1` and `carry2`.
-    uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
-    uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
-    // Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
-    // This is where the assumption about unset high bit of `a` and `b` is relevant.
-    multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
-    // Corrections that need to be performed when differences are negative.
-    // Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
-    if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
-    if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
-    // Now that middle part is fully correct, it can be added to the result.
-    add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);
+    if constexpr (TLC > 2) {
+      // Next two lines multiply high and low halves of operands (\f$ a_{lo} \cdot b_{lo}; a_{hi} \cdot b_{hi} \$f) and
+      // write the results into `r`.
+      multiply_short_raw_device(a, b, r);
+      multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
+      __align__(16) uint32_t middle_part[TLC];
+      __align__(16) uint32_t diffs[TLC];
+      // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to
+      // `carry1` and `carry2`.
+      uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
+      uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
+      // Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
+      // This is where the assumption about unset high bit of `a` and `b` is relevant.
+      multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
+      // Corrections that need to be performed when differences are negative.
+      // Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
+      if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
+      if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
+      // Now that middle part is fully correct, it can be added to the result.
+      add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);

-    // Carry from adding middle part has to be propagated to the highest limb.
-    for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
-      r[i] = ptx::addc_cc(r[i], 0);
+      // Carry from adding middle part has to be propagated to the highest limb.
+      for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
+        r[i] = ptx::addc_cc(r[i], 0);
+    } else if (TLC == 2) {
+      __align__(8) uint32_t odd[2];
+      r[0] = ptx::mul_lo(a[0], b[0]);
+      r[1] = ptx::mul_hi(a[0], b[0]);
+      r[2] = ptx::mul_lo(a[1], b[1]);
+      r[3] = ptx::mul_hi(a[1], b[1]);
+      odd[0] = ptx::mul_lo(a[0], b[1]);
+      odd[1] = ptx::mul_hi(a[0], b[1]);
+      odd[0] = ptx::mad_lo(a[1], b[0], odd[0]);
+      odd[1] = ptx::mad_hi(a[1], b[0], odd[1]);
+      r[1] = ptx::add_cc(r[1], odd[0]);
+      r[2] = ptx::addc_cc(r[2], odd[1]);
+      r[3] = ptx::addc(r[3], 0);
+    } else if (TLC == 1) {
+      r[0] = ptx::mul_lo(a[0], b[0]);
+      r[1] = ptx::mul_hi(a[0], b[0]);
+    }
  }

  static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
@@ -647,13 +685,13 @@ public:
  }

  static HOST_DEVICE_INLINE void
-  multiply_and_add_lsb_raw(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
+  multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
  {
 #ifdef __CUDA_ARCH__
-    return multiply_and_add_lsb_raw_device(as, bs, cs, rs);
+    return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
 #else
    Wide r_wide = {};
-    multiply_raw_host(as, bs, r_wide.limbs_storage);
+    multiply_raw_host(as, get_neg_modulus(), r_wide.limbs_storage);
    Field r = Wide::get_lower(r_wide);
    add_limbs<false>(cs, r.limbs_storage, rs);
 #endif
@@ -784,7 +822,7 @@ public:
    Field xs_lo = Wide::get_lower(xs);
    // Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
    // `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
-    multiply_and_add_lsb_raw(l_hi.limbs_storage, get_neg_modulus(), xs_lo.limbs_storage, r.limbs_storage);
+    multiply_and_add_lsb_neg_modulus_raw(l_hi.limbs_storage, xs_lo.limbs_storage, r.limbs_storage);
    ff_storage r_reduced = {};
    uint32_t carry;
    // As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
@@ -888,21 +926,24 @@ public:
    return rs;
  }

+  // Assumes the number is even!
  template <unsigned MODULUS_MULTIPLE = 1>
  static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
  {
    const uint32_t* x = xs.limbs_storage.limbs;
    Field rs = {};
    uint32_t* r = rs.limbs_storage.limbs;
+    if constexpr (TLC > 1) {
 #ifdef __CUDA_ARCH__
-    UNROLL
+      UNROLL
 #endif
-    for (unsigned i = 0; i < TLC - 1; i++) {
+      for (unsigned i = 0; i < TLC - 1; i++) {
 #ifdef __CUDA_ARCH__
-      r[i] = __funnelshift_rc(x[i], x[i + 1], 1);
+        r[i] = __funnelshift_rc(x[i], x[i + 1], 1);
 #else
-      r[i] = (x[i] >> 1) | (x[i + 1] << 31);
+        r[i] = (x[i] >> 1) | (x[i + 1] << 31);
 #endif
+      }
    }
    r[TLC - 1] = x[TLC - 1] >> 1;
    return sub_modulus<MODULUS_MULTIPLE>(rs);
@@ -962,4 +1003,13 @@ struct std::hash<Field<CONFIG>> {
      hash ^= std::hash<uint32_t>()(key.limbs_storage.limbs[i]) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
    return hash;
  }
+};
+
+template <class CONFIG>
+struct SharedMemory<Field<CONFIG>> {
+  __device__ Field<CONFIG>* getPointer()
+  {
+    extern __shared__ Field<CONFIG> s_scalar_[];
+    return s_scalar_;
+  }
 };
--- a/icicle/include/fields/field_config.cuh
+++ b/icicle/include/fields/field_config.cuh
@@ -0,0 +1,51 @@
+#pragma once
+#ifndef FIELD_CONFIG_H
+#define FIELD_CONFIG_H
+
+#include "fields/id.h"
+#include "fields/field.cuh"
+
+#if FIELD_ID == BN254
+#include "fields/snark_fields/bn254_scalar.cuh"
+using bn254::fp_config;
+#elif FIELD_ID == BLS12_381
+#include "fields/snark_fields/bls12_381_scalar.cuh"
+using bls12_381::fp_config;
+#elif FIELD_ID == BLS12_377
+#include "fields/snark_fields/bls12_377_scalar.cuh"
+using bls12_377::fp_config;
+#elif FIELD_ID == BW6_761
+#include "fields/snark_fields/bls12_377_base.cuh"
+typedef bls12_377::fq_config fp_config;
+#elif FIELD_ID == GRUMPKIN
+#include "fields/snark_fields/bn254_base.cuh"
+typedef bn254::fq_config fp_config;
+
+#elif FIELD_ID == BABY_BEAR
+#include "fields/stark_fields/baby_bear.cuh"
+using baby_bear::fp_config;
+#ifdef EXT_FIELD
+#include "fields/quartic_extension.cuh"
+#endif
+#endif
+
+/**
+ * @namespace field_config
+ * Namespace with type definitions for finite fields. Here, concrete types are created in accordance
+ * with the `-DFIELD` env variable passed during build.
+ */
+namespace field_config {
+  /**
+   * Scalar field. Is always a prime field.
+   */
+  typedef Field<fp_config> scalar_t;
+
+#ifdef EXT_FIELD
+  /**
+   * Extension field of `scalar_t` enabled if `-DEXT_FIELD` env variable is.
+   */
+  typedef ExtensionField<fp_config> extension_t;
+#endif
+} // namespace field_config
+
+#endif
--- a/icicle/include/fields/host_math.cuh
+++ b/icicle/include/fields/host_math.cuh
@@ -4,7 +4,7 @@

 #include <cstdint>
 #include <cuda_runtime.h>
-#include "common.cuh"
+#include "gpu-utils/modifiers.cuh"
 namespace host_math {

  // return x + y with uint32_t operands
--- a/icicle/include/fields/id.h
+++ b/icicle/include/fields/id.h
@@ -0,0 +1,13 @@
+#pragma once
+#ifndef FIELD_ID_H
+#define FIELD_ID_H
+
+#define BN254     1
+#define BLS12_381 2
+#define BLS12_377 3
+#define BW6_761   4
+#define GRUMPKIN  5
+
+#define BABY_BEAR 1001
+
+#endif
--- a/icicle/include/fields/ptx.cuh
+++ b/icicle/include/fields/ptx.cuh
--- a/icicle/include/fields/quadratic_extension.cuh
+++ b/icicle/include/fields/quadratic_extension.cuh
@@ -1,12 +1,15 @@
 #pragma once

 #include "field.cuh"
-#include "common.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "gpu-utils/sharedmem.cuh"

 template <typename CONFIG>
 class ExtensionField
 {
 private:
+  friend Field<CONFIG>;
+
  typedef typename Field<CONFIG>::Wide FWide;

  struct ExtensionWide {
@@ -47,6 +50,12 @@ public:

  static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }

+  static void RandHostMany(ExtensionField* out, int size)
+  {
+    for (int i = 0; i < size; i++)
+      out[i] = rand_host();
+  }
+
  template <unsigned REDUCTION_SIZE = 1>
  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
  {
@@ -69,15 +78,47 @@ public:
    return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
  }

+  friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs + ys.real, ys.imaginary};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs - ys.real, FF::neg(ys.imaginary)};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real + ys, xs.imaginary};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real - ys, xs.imaginary};
+  }
+
  template <unsigned MODULUS_MULTIPLE = 1>
  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
  {
    FWide real_prod = FF::mul_wide(xs.real, ys.real);
    FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
    FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary);
-    FWide i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
-    i_sq_times_im = CONFIG::i_squared_is_negative ? FWide::neg(i_sq_times_im) : i_sq_times_im;
-    return ExtensionWide{real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod};
+    FWide nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(imaginary_prod);
+    nonresidue_times_im = CONFIG::nonresidue_is_negative ? FWide::neg(nonresidue_times_im) : nonresidue_times_im;
+    return ExtensionWide{real_prod + nonresidue_times_im, prod_of_sums - real_prod - imaginary_prod};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
+  {
+    return ExtensionWide{FF::mul_wide(xs.real, ys), FF::mul_wide(xs.imaginary, ys)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
+  {
+    return mul_wide(ys, xs);
  }

  template <unsigned MODULUS_MULTIPLE = 1>
@@ -87,7 +128,8 @@ public:
      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
  }

-  friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys)
+  template <class T1, class T2>
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
  {
    ExtensionWide xy = mul_wide(xs, ys);
    return reduce(xy);
@@ -111,9 +153,9 @@ public:
    FF imaginary_prod = FF::template mul_const<mul_imaginary>(xs_imaginary);
    FF re_im = FF::template mul_const<mul_real>(xs_imaginary);
    FF im_re = FF::template mul_const<mul_imaginary>(xs_real);
-    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
-    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
-    return ExtensionField{real_prod + i_sq_times_im, re_im + im_re};
+    FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(imaginary_prod);
+    nonresidue_times_im = CONFIG::nonresidue_is_negative ? FF::neg(nonresidue_times_im) : nonresidue_times_im;
+    return ExtensionField{real_prod + nonresidue_times_im, re_im + im_re};
  }

  template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
@@ -142,14 +184,23 @@ public:
    return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
  }

-  // inverse assumes that xs is nonzero
+  // inverse of zero is set to be zero which is what we want most of the time
  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
  {
    ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
-    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(FF::sqr(xs.imaginary));
-    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr(xs.imaginary));
+    nonresidue_times_im = CONFIG::nonresidue_is_negative ? FF::neg(nonresidue_times_im) : nonresidue_times_im;
    // TODO: wide here
-    FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
+    FF xs_norm_squared = FF::sqr(xs.real) - nonresidue_times_im;
    return xs_conjugate * ExtensionField{FF::inverse(xs_norm_squared), FF::zero()};
  }
+};
+
+template <class CONFIG>
+struct SharedMemory<ExtensionField<CONFIG>> {
+  __device__ ExtensionField<CONFIG>* getPointer()
+  {
+    extern __shared__ ExtensionField<CONFIG> s_ext2_scalar_[];
+    return s_ext2_scalar_;
+  }
 };
--- a/icicle/include/fields/quartic_extension.cuh
+++ b/icicle/include/fields/quartic_extension.cuh
@@ -0,0 +1,257 @@
+#pragma once
+
+#include "field.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "gpu-utils/sharedmem.cuh"
+
+template <typename CONFIG>
+class ExtensionField
+{
+private:
+  typedef typename Field<CONFIG>::Wide FWide;
+
+  struct ExtensionWide {
+    FWide real;
+    FWide im1;
+    FWide im2;
+    FWide im3;
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
+    }
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
+    }
+  };
+
+public:
+  typedef Field<CONFIG> FF;
+  static constexpr unsigned TLC = 4 * CONFIG::limbs_count;
+
+  FF real;
+  FF im1;
+  FF im2;
+  FF im3;
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField zero()
+  {
+    return ExtensionField{FF::zero(), FF::zero(), FF::zero(), FF::zero()};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField one()
+  {
+    return ExtensionField{FF::one(), FF::zero(), FF::zero(), FF::zero()};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField ToMontgomery(const ExtensionField& xs)
+  {
+    return ExtensionField{
+      xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r},
+      xs.im3 * FF{CONFIG::montgomery_r}};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField FromMontgomery(const ExtensionField& xs)
+  {
+    return ExtensionField{
+      xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv},
+      xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}};
+  }
+
+  static HOST_INLINE ExtensionField rand_host()
+  {
+    return ExtensionField{FF::rand_host(), FF::rand_host(), FF::rand_host(), FF::rand_host()};
+  }
+
+  static void RandHostMany(ExtensionField* out, int size)
+  {
+    for (int i = 0; i < size; i++)
+      out[i] = rand_host();
+  }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
+  {
+    return ExtensionField{
+      FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.im1),
+      FF::sub_modulus<REDUCTION_SIZE>(&xs.im2), FF::sub_modulus<REDUCTION_SIZE>(&xs.im3)};
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs)
+  {
+    os << "{ Real: " << xs.real << " }; { Im1: " << xs.im1 << " }; { Im2: " << xs.im2 << " }; { Im3: " << xs.im3
+       << " };";
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs + ys.real, ys.im1, ys.im2, ys.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs - ys.real, FF::neg(ys.im1), FF::neg(ys.im2), FF::neg(ys.im3)};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real + ys, xs.im1, xs.im2, xs.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real - ys, xs.im1, xs.im2, xs.im3};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    if (CONFIG::nonresidue_is_negative)
+      return ExtensionWide{
+        FF::mul_wide(xs.real, ys.real) -
+          FF::template mul_unsigned<CONFIG::nonresidue>(
+            FF::mul_wide(xs.im1, ys.im3) + FF::mul_wide(xs.im2, ys.im2) + FF::mul_wide(xs.im3, ys.im1)),
+        FF::mul_wide(xs.real, ys.im1) + FF::mul_wide(xs.im1, ys.real) -
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, ys.im3) + FF::mul_wide(xs.im3, ys.im2)),
+        FF::mul_wide(xs.real, ys.im2) + FF::mul_wide(xs.im1, ys.im1) + FF::mul_wide(xs.im2, ys.real) -
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, ys.im3)),
+        FF::mul_wide(xs.real, ys.im3) + FF::mul_wide(xs.im1, ys.im2) + FF::mul_wide(xs.im2, ys.im1) +
+          FF::mul_wide(xs.im3, ys.real)};
+    else
+      return ExtensionWide{
+        FF::mul_wide(xs.real, ys.real) +
+          FF::template mul_unsigned<CONFIG::nonresidue>(
+            FF::mul_wide(xs.im1, ys.im3) + FF::mul_wide(xs.im2, ys.im2) + FF::mul_wide(xs.im3, ys.im1)),
+        FF::mul_wide(xs.real, ys.im1) + FF::mul_wide(xs.im1, ys.real) +
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, ys.im3) + FF::mul_wide(xs.im3, ys.im2)),
+        FF::mul_wide(xs.real, ys.im2) + FF::mul_wide(xs.im1, ys.im1) + FF::mul_wide(xs.im2, ys.real) +
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, ys.im3)),
+        FF::mul_wide(xs.real, ys.im3) + FF::mul_wide(xs.im1, ys.im2) + FF::mul_wide(xs.im2, ys.im1) +
+          FF::mul_wide(xs.im3, ys.real)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
+  {
+    return ExtensionWide{
+      FF::mul_wide(xs.real, ys), FF::mul_wide(xs.im1, ys), FF::mul_wide(xs.im2, ys), FF::mul_wide(xs.im3, ys)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
+  {
+    return ExtensionWide{
+      FF::mul_wide(xs, ys.real), FF::mul_wide(xs, ys.im1), FF::mul_wide(xs, ys.im2), FF::mul_wide(xs, ys.im3)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
+  {
+    return ExtensionField{
+      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.im1),
+      FF::template reduce<MODULUS_MULTIPLE>(xs.im2), FF::template reduce<MODULUS_MULTIPLE>(xs.im3)};
+  }
+
+  template <class T1, class T2>
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
+  {
+    ExtensionWide xy = mul_wide(xs, ys);
+    return reduce(xy);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
+
+  template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
+  {
+    return {
+      FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.im1),
+      FF::template mul_unsigned<multiplier>(xs.im2), FF::template mul_unsigned<multiplier>(xs.im3)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide<MODULUS_MULTIPLE>(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
+  {
+    return {FF::neg(xs.real), FF::neg(xs.im1), FF::neg(xs.im2), FF::neg(xs.im3)};
+  }
+
+  // inverse of zero is set to be zero which is what we want most of the time
+  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
+  {
+    FF x, x0, x2;
+    if (CONFIG::nonresidue_is_negative) {
+      x0 = FF::reduce(
+        FF::sqr_wide(xs.real) +
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im1, xs.im3 + xs.im3) - FF::sqr_wide(xs.im2)));
+      x2 = FF::reduce(
+        FF::mul_wide(xs.real, xs.im2 + xs.im2) - FF::sqr_wide(xs.im1) +
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(xs.im3)));
+      x = FF::reduce(FF::sqr_wide(x0) + FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(x2)));
+    } else {
+      x0 = FF::reduce(
+        FF::sqr_wide(xs.real) -
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im1, xs.im3 + xs.im3) - FF::sqr_wide(xs.im2)));
+      x2 = FF::reduce(
+        FF::mul_wide(xs.real, xs.im2 + xs.im2) - FF::sqr_wide(xs.im1) -
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(xs.im3)));
+      x = FF::reduce(FF::sqr_wide(x0) - FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(x2)));
+    }
+    FF x_inv = FF::inverse(x);
+    x0 = x0 * x_inv;
+    x2 = x2 * x_inv;
+    return {
+      FF::reduce(
+        (CONFIG::nonresidue_is_negative
+           ? (FF::mul_wide(xs.real, x0) + FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, x2)))
+           : (FF::mul_wide(xs.real, x0)) - FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, x2)))),
+      FF::reduce(
+        (CONFIG::nonresidue_is_negative
+           ? FWide::neg(FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, x2)))
+           : FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, x2))) -
+        FF::mul_wide(xs.im1, x0)),
+      FF::reduce(FF::mul_wide(xs.im2, x0) - FF::mul_wide(xs.real, x2)),
+      FF::reduce(FF::mul_wide(xs.im1, x2) - FF::mul_wide(xs.im3, x0)),
+    };
+  }
+};
+
+template <class CONFIG>
+struct SharedMemory<ExtensionField<CONFIG>> {
+  __device__ ExtensionField<CONFIG>* getPointer()
+  {
+    extern __shared__ ExtensionField<CONFIG> s_ext4_scalar_[];
+    return s_ext4_scalar_;
+  }
+};
--- a/icicle/include/fields/snark_fields/bls12_377_base.cuh
+++ b/icicle/include/fields/snark_fields/bls12_377_base.cuh
@@ -1,196 +1,10 @@
 #pragma once
-#ifndef BLS12_377_PARAMS_H
-#define BLS12_377_PARAMS_H
+#ifndef BLS12_377_BASE_PARAMS_H
+#define BLS12_377_BASE_PARAMS_H

-#include "utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bls12_377 {
-  struct fp_config {
-    static constexpr unsigned limbs_count = 8;
-    static constexpr unsigned omegas_count = 47;
-    static constexpr unsigned modulus_bit_count = 253;
-    static constexpr unsigned num_of_reductions = 1;
-
-    static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
-                                                     0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
-    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd,
-                                                       0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
-    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb,
-                                                       0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
-    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901,
-                                                         0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1};
-    static constexpr storage<2 * limbs_count> modulus_wide = {
-      0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
-      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2 * limbs_count> modulus_squared = {
-      0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3,
-      0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
-    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
-      0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7,
-      0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
-    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f,
-      0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
-
-    static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b,
-                                               0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f,
-                                                          0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95,
-                                                              0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
-
-    static constexpr storage_array<omegas_count, limbs_count> omega = {
-      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
-       {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000},
-       {0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08},
-       {0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236},
-       {0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4},
-       {0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842},
-       {0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673},
-       {0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2},
-       {0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da},
-       {0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4},
-       {0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e},
-       {0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf},
-       {0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c},
-       {0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447},
-       {0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511},
-       {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1},
-       {0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156},
-       {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
-       {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
-       {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
-       {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
-       {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
-       {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
-       {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
-       {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
-       {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
-       {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
-       {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
-       {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
-       {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
-       {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
-       {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
-       {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
-       {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
-       {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
-       {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
-       {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
-       {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
-       {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
-       {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
-       {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
-       {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
-       {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
-       {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
-       {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
-       {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
-       {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}};
-
-    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
-      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
-       {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e},
-       {0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338},
-       {0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4},
-       {0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481},
-       {0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e},
-       {0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e},
-       {0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e},
-       {0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183},
-       {0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f},
-       {0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3},
-       {0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22},
-       {0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032},
-       {0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db},
-       {0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a},
-       {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2},
-       {0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb},
-       {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
-       {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
-       {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
-       {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
-       {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
-       {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
-       {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
-       {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
-       {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
-       {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
-       {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
-       {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
-       {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
-       {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
-       {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
-       {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
-       {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
-       {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
-       {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
-       {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
-       {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
-       {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
-       {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
-       {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
-       {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
-       {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
-       {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
-       {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
-       {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
-       {0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}}};
-
-    static constexpr storage_array<omegas_count, limbs_count> inv = {
-      {{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af},
-       {0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f, 0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06},
-       {0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e, 0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2},
-       {0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e, 0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08},
-       {0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346, 0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33},
-       {0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22, 0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9},
-       {0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210, 0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93},
-       {0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87, 0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9},
-       {0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3, 0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab},
-       {0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61, 0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85},
-       {0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af, 0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1},
-       {0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57, 0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8},
-       {0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab, 0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003},
-       {0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054, 0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1},
-       {0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9, 0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007},
-       {0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54, 0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3},
-       {0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29, 0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08},
-       {0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094, 0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3},
-       {0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9, 0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309},
-       {0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164, 0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433},
-       {0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31, 0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9},
-       {0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198, 0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513},
-       {0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b, 0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539},
-       {0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5, 0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b},
-       {0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651, 0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555},
-       {0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8, 0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559},
-       {0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3, 0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c},
-       {0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9, 0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d},
-       {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e},
-       {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e},
-       {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e},
-       {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e},
-       {0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e},
-       {0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e},
-       {0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e},
-       {0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e},
-       {0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e},
-       {0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e},
-       {0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e},
-       {0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e},
-       {0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e},
-       {0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e},
-       {0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e},
-       {0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e},
-       {0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e},
-       {0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e},
-       {0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}};
-  };
-
  struct fq_config {
    static constexpr unsigned limbs_count = 12;
    static constexpr unsigned omegas_count = 48;
@@ -521,41 +335,11 @@ namespace bls12_377 {
       {0xfffdebde, 0x0ff7ffff, 0x0fffa3d3, 0x8e4c751f, 0x6bcccc32, 0xb7275e5b, 0xdc08ab03, 0x0321276d, 0x28f6304f,
        0xdd22a6ac, 0x17c50a31, 0x01ae3a46}}};

-    // i^2, the square of the imaginary unit for the extension field
-    static constexpr uint32_t i_squared = 5;
-    // true if i^2 is negative
-    static constexpr bool i_squared_is_negative = true;
+    // nonresidue to generate the extension field
+    static constexpr uint32_t nonresidue = 5;
+    // true if nonresidue is negative
+    static constexpr bool nonresidue_is_negative = true;
  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
-                                                               0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
-                                                               0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
-                                                               0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
-                                                               0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
-                                                                  0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
-                                                                  0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
-                                                                  0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
-                                                                  0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
-                                                                  0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
-                                                                  0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
-                                                                  0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
-                                                                  0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
-    0x9999999a, 0x1c9ed999, 0x1ccccccd, 0x0dd39e5c, 0x3c6bf800, 0x129207b6,
-    0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6};
 } // namespace bls12_377

-#endif
+#endif
--- a/icicle/include/fields/snark_fields/bls12_377_scalar.cuh
+++ b/icicle/include/fields/snark_fields/bls12_377_scalar.cuh
@@ -0,0 +1,195 @@
+#pragma once
+#ifndef BLS12_377_SCALAR_PARAMS_H
+#define BLS12_377_SCALAR_PARAMS_H
+
+#include "fields/storage.cuh"
+
+namespace bls12_377 {
+  struct fp_config {
+    static constexpr unsigned limbs_count = 8;
+    static constexpr unsigned omegas_count = 47;
+    static constexpr unsigned modulus_bit_count = 253;
+    static constexpr unsigned num_of_reductions = 1;
+
+    static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
+                                                     0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
+    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd,
+                                                       0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
+    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb,
+                                                       0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
+    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901,
+                                                         0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3,
+      0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7,
+      0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f,
+      0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
+
+    static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b,
+                                               0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f,
+                                                          0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95,
+                                                              0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega = {
+      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
+       {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000},
+       {0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08},
+       {0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236},
+       {0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4},
+       {0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842},
+       {0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673},
+       {0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2},
+       {0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da},
+       {0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4},
+       {0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e},
+       {0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf},
+       {0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c},
+       {0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447},
+       {0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511},
+       {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1},
+       {0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156},
+       {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
+       {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
+       {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
+       {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
+       {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
+       {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
+       {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
+       {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
+       {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
+       {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
+       {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
+       {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
+       {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
+       {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
+       {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
+       {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
+       {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
+       {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
+       {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
+       {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
+       {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
+       {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
+       {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
+       {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
+       {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
+       {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
+       {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
+       {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
+       {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
+       {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
+      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
+       {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e},
+       {0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338},
+       {0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4},
+       {0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481},
+       {0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e},
+       {0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e},
+       {0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e},
+       {0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183},
+       {0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f},
+       {0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3},
+       {0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22},
+       {0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032},
+       {0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db},
+       {0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a},
+       {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2},
+       {0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb},
+       {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
+       {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
+       {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
+       {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
+       {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
+       {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
+       {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
+       {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
+       {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
+       {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
+       {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
+       {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
+       {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
+       {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
+       {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
+       {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
+       {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
+       {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
+       {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
+       {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
+       {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
+       {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
+       {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
+       {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
+       {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
+       {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
+       {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
+       {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
+       {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
+       {0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> inv = {
+      {{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af},
+       {0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f, 0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06},
+       {0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e, 0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2},
+       {0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e, 0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08},
+       {0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346, 0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33},
+       {0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22, 0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9},
+       {0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210, 0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93},
+       {0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87, 0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9},
+       {0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3, 0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab},
+       {0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61, 0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85},
+       {0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af, 0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1},
+       {0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57, 0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8},
+       {0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab, 0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003},
+       {0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054, 0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1},
+       {0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9, 0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007},
+       {0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54, 0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3},
+       {0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29, 0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08},
+       {0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094, 0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3},
+       {0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9, 0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309},
+       {0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164, 0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433},
+       {0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31, 0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9},
+       {0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198, 0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513},
+       {0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b, 0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539},
+       {0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5, 0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b},
+       {0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651, 0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555},
+       {0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8, 0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559},
+       {0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3, 0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c},
+       {0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9, 0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d},
+       {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e},
+       {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e},
+       {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e},
+       {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e},
+       {0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e},
+       {0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e},
+       {0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e},
+       {0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e},
+       {0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e},
+       {0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e},
+       {0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e},
+       {0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e},
+       {0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e},
+       {0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e},
+       {0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e},
+       {0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e},
+       {0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e},
+       {0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e},
+       {0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}};
+  };
+} // namespace bls12_377
+
+#endif
--- a/icicle/include/fields/snark_fields/bls12_381_base.cuh
+++ b/icicle/include/fields/snark_fields/bls12_381_base.cuh
@@ -0,0 +1,61 @@
+#pragma once
+#ifndef BLS12_381_BASE_PARAMS_H
+#define BLS12_381_BASE_PARAMS_H
+
+#include "fields/storage.cuh"
+
+namespace bls12_381 {
+  struct fq_config {
+    static constexpr unsigned limbs_count = 12;
+    static constexpr unsigned modulus_bit_count = 381;
+    static constexpr unsigned num_of_reductions = 1;
+    static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
+                                                     0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
+                                                     0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
+    static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
+                                                       0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
+                                                       0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
+    static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
+                                                       0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
+                                                       0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
+    static constexpr storage<limbs_count> neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001,
+                                                         0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b,
+                                                         0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
+      0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
+      0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
+      0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
+      0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
+      0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
+      0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
+      0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
+    static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
+                                               0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b,
+                                                          0x53c758ba, 0x5f489857, 0x70525745, 0x77ce5853,
+                                                          0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274,
+                                                              0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14,
+                                                              0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701};
+    // nonresidue to generate the extension field
+    static constexpr uint32_t nonresidue = 1;
+    // true if nonresidue is negative
+    static constexpr bool nonresidue_is_negative = true;
+  };
+} // namespace bls12_381
+
+#endif
--- a/icicle/include/fields/snark_fields/bls12_381_scalar.cuh
+++ b/icicle/include/fields/snark_fields/bls12_381_scalar.cuh
@@ -1,8 +1,8 @@
 #pragma once
-#ifndef BLS12_381_PARAMS_H
-#define BLS12_381_PARAMS_H
+#ifndef BLS12_381_SCALAR_PARAMS_H
+#define BLS12_381_SCALAR_PARAMS_H

-#include "utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bls12_381 {
  struct fp_config {
@@ -145,88 +145,6 @@ namespace bls12_381 {
       {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752},
       {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, 0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752}}};
  };
-
-  struct fq_config {
-    static constexpr unsigned limbs_count = 12;
-    static constexpr unsigned modulus_bit_count = 381;
-    static constexpr unsigned num_of_reductions = 1;
-    static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
-                                                     0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
-                                                     0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
-    static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
-                                                       0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
-                                                       0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
-    static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
-                                                       0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
-                                                       0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
-    static constexpr storage<limbs_count> neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001,
-                                                         0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b,
-                                                         0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15};
-    static constexpr storage<2 * limbs_count> modulus_wide = {
-      0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
-      0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2 * limbs_count> modulus_squared = {
-      0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
-      0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
-      0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
-    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
-      0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
-      0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
-      0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
-    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
-      0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
-      0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
-    static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
-                                               0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b,
-                                                          0x53c758ba, 0x5f489857, 0x70525745, 0x77ce5853,
-                                                          0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274,
-                                                              0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14,
-                                                              0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701};
-    // i^2, the square of the imaginary unit for the extension field
-    static constexpr uint32_t i_squared = 1;
-    // true if i^2 is negative
-    static constexpr bool i_squared_is_negative = true;
-  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
-                                                               0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
-                                                               0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
-                                                               0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
-                                                               0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
-                                                                  0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
-                                                                  0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
-                                                                  0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
-                                                                  0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
-                                                                  0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
-                                                                  0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
-                                                                  0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
-                                                                  0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
-    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
-    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
 } // namespace bls12_381

 #endif
--- a/icicle/include/fields/snark_fields/bn254_base.cuh
+++ b/icicle/include/fields/snark_fields/bn254_base.cuh
@@ -0,0 +1,49 @@
+#pragma once
+#ifndef BN254_BASE_PARAMS_H
+#define BN254_BASE_PARAMS_H
+
+#include "fields/storage.cuh"
+
+namespace bn254 {
+  struct fq_config {
+    static constexpr unsigned limbs_count = 8;
+    static constexpr unsigned modulus_bit_count = 254;
+    static constexpr unsigned num_of_reductions = 1;
+    static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
+                                                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
+    static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
+                                                       0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
+    static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
+                                                       0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
+    static constexpr storage<limbs_count> neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e,
+                                                         0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95,
+      0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a,
+      0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55,
+      0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
+    static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17,
+                                               0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28,
+                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285,
+                                                              0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
+    // nonresidue to generate the extension field
+    static constexpr uint32_t nonresidue = 1;
+    // true if nonresidue is negative
+    static constexpr bool nonresidue_is_negative = true;
+  };
+} // namespace bn254
+
+#endif
--- a/icicle/include/fields/snark_fields/bn254_scalar.cuh
+++ b/icicle/include/fields/snark_fields/bn254_scalar.cuh
@@ -1,8 +1,8 @@
 #pragma once
-#ifndef BN254_PARAMS_H
-#define BN254_PARAMS_H
+#ifndef BN254_SCALAR_PARAMS_H
+#define BN254_SCALAR_PARAMS_H

-#include "utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bn254 {
  struct fp_config {
@@ -133,67 +133,6 @@ namespace bn254 {
       {0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c},
       {0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}}};
  };
-
-  struct fq_config {
-    static constexpr unsigned limbs_count = 8;
-    static constexpr unsigned modulus_bit_count = 254;
-    static constexpr unsigned num_of_reductions = 1;
-    static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
-                                                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
-    static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
-                                                       0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
-    static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
-                                                       0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
-    static constexpr storage<limbs_count> neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e,
-                                                         0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
-    static constexpr storage<2 * limbs_count> modulus_wide = {
-      0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
-      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2 * limbs_count> modulus_squared = {
-      0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95,
-      0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
-    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
-      0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a,
-      0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
-    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55,
-      0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
-    static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17,
-                                               0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28,
-                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285,
-                                                              0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
-    // i^2, the square of the imaginary unit for the extension field
-    static constexpr uint32_t i_squared = 1;
-    // true if i^2 is negative
-    static constexpr bool i_squared_is_negative = true;
-  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
-                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
-                                                                  0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
-                                                                  0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
-                                                                  0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
-                                                                  0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
-    0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
-    0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0};
 } // namespace bn254

 #endif
--- a/icicle/include/fields/snark_fields/bw6_761_base.cuh
+++ b/icicle/include/fields/snark_fields/bw6_761_base.cuh
@@ -1,13 +1,10 @@
 #pragma once
-#ifndef BW6_761_PARAMS_H
-#define BW6_761_PARAMS_H
+#ifndef BW6_761_BASE_BASE_H
+#define BW6_761_BASE_BASE_H

-#include "utils/storage.cuh"
-#include "bls12_377_params.cuh"
+#include "fields/storage.cuh"

 namespace bw6_761 {
-  typedef bls12_377::fq_config fp_config;
-
  struct fq_config {
    static constexpr unsigned limbs_count = 24;
    static constexpr unsigned modulus_bit_count = 761;
@@ -77,33 +74,6 @@ namespace bw6_761 {
      0x7695ef18, 0x5e763565, 0x4fae56bb, 0x226022c2, 0xb70d7652, 0x80e7f067, 0x72116b89, 0x435a8b4a,
      0x5d84e0d4, 0xac258fd6, 0x4427c7b2, 0x47ee8ac5, 0xd04e621b, 0x478c4048, 0x2add3e93, 0x00e0aa7d};
  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {
-    0x66e5b43d, 0x4088f3af, 0xa6af603f, 0x055928ac, 0x56133e82, 0x6750dd03, 0x280ca27f, 0x03758f9a,
-    0xc9ea0971, 0x5bd71fa0, 0x47729b90, 0xa17a54ce, 0x94c2e746, 0x11dbfcd2, 0xc15520ac, 0x79017ffa,
-    0x85f56fc7, 0xee05c54b, 0x551b27f0, 0xe6a0cfb7, 0xa477beae, 0xb277ce98, 0x0ea190c8, 0x01075b02};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {
-    0xb4e95363, 0xbafc8f2d, 0x0b20d2a1, 0xad1cb2be, 0xcad0fb93, 0xb2b08119, 0xb3053253, 0x9f9df141,
-    0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1,
-    0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x = {
-    0xcd025f1c, 0xa830c194, 0xe1bf995b, 0x6410cf4f, 0xc2ad54b0, 0x00e96efb, 0x3cd208d7, 0xce6948cb,
-    0x00e1b6ba, 0x963317a3, 0xac70e7c7, 0xc5bbcae9, 0xf09feb58, 0x734ec3f1, 0xab3da268, 0x26b41c5d,
-    0x13890f6d, 0x4c062010, 0xc5a7115f, 0xd61053aa, 0x69d660f9, 0xc852a82e, 0x41d9b816, 0x01101332};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y = {
-    0x28c73b61, 0xeb70a167, 0xf9eac689, 0x91ec0594, 0x3c5a02a5, 0x58aa2d3a, 0x504affc7, 0x3ea96fcd,
-    0xffa82300, 0x8906c170, 0xd2c712b8, 0x64f293db, 0x33293fef, 0x94c97eb7, 0x0b95a59c, 0x0a1d86c8,
-    0x53ffe316, 0x81a78e27, 0xcec2181c, 0x26b7cf9a, 0xe4b6d2dc, 0x8179eb10, 0x7761369f, 0x0017c335};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {
-    0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
-    0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
-    0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
-  static constexpr storage<fq_config::limbs_count> g2_weierstrass_b = {
-    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
 } // namespace bw6_761

-#endif
+#endif
--- a/icicle/include/fields/stark_fields/baby_bear.cuh
+++ b/icicle/include/fields/stark_fields/baby_bear.cuh
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "fields/storage.cuh"
+#include "fields/field.cuh"
+
+namespace baby_bear {
+  struct fp_config {
+    static constexpr unsigned limbs_count = 1;
+    static constexpr unsigned omegas_count = 28;
+    static constexpr unsigned modulus_bit_count = 31;
+    static constexpr unsigned num_of_reductions = 1;
+
+    static constexpr storage<limbs_count> modulus = {0x78000001};
+    static constexpr storage<limbs_count> modulus_2 = {0xf0000002};
+    static constexpr storage<limbs_count> modulus_4 = {0x00000000};
+    static constexpr storage<limbs_count> neg_modulus = {0x87ffffff};
+    static constexpr storage<2 * limbs_count> modulus_wide = {0x78000001, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = {0xf0000001, 0x38400000};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {0xe0000002, 0x70800001};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {0xc0000004, 0xe1000003};
+
+    static constexpr storage<limbs_count> m = {0x88888887};
+    static constexpr storage<limbs_count> one = {0x00000001};
+    static constexpr storage<limbs_count> zero = {0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xffffffe};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x38400000};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega = {
+      {{0x78000000}, {0x10faa3e0}, {0x6b615c47}, {0x21ceed5a}, {0x2c1c3348}, {0x36c54c86}, {0x701dd01c},
+       {0x56a9a28e}, {0x03e4cabf}, {0x5bacde79}, {0x1eb53838}, {0x1cd781af}, {0x0961a0b7}, {0x65098a87},
+       {0x77851a0b}, {0x5bcba331}, {0x053fc0f5}, {0x5bf816e5}, {0x4bb124ab}, {0x571e9d4e}, {0x313732cb},
+       {0x28aca172}, {0x4e319b52}, {0x45692d95}, {0x14ff4ba1}, {0x00004951}, {0x00000089}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
+      {{0x78000000}, {0x67055c21}, {0x5ee99486}, {0x0bb4c4e4}, {0x4ab33b27}, {0x044b4497}, {0x410e23aa},
+       {0x08a7ee2b}, {0x563cb93d}, {0x3d70b4b7}, {0x77d999f1}, {0x6ceb65b5}, {0x49e7f635}, {0x0eae3a8c},
+       {0x238b8a78}, {0x70d71b0a}, {0x0eaacc45}, {0x5af0f193}, {0x47303308}, {0x573cbfad}, {0x29ff72c0},
+       {0x05af9dac}, {0x00ef24df}, {0x26985530}, {0x22d1ce4b}, {0x08359375}, {0x2cabe994}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> inv = {
+      {{0x3c000001}, {0x5a000001}, {0x69000001}, {0x70800001}, {0x74400001}, {0x76200001}, {0x77100001},
+       {0x77880001}, {0x77c40001}, {0x77e20001}, {0x77f10001}, {0x77f88001}, {0x77fc4001}, {0x77fe2001},
+       {0x77ff1001}, {0x77ff8801}, {0x77ffc401}, {0x77ffe201}, {0x77fff101}, {0x77fff881}, {0x77fffc41},
+       {0x77fffe21}, {0x77ffff11}, {0x77ffff89}, {0x77ffffc5}, {0x77ffffe3}, {0x77fffff2}}};
+
+    // nonresidue to generate the extension field
+    static constexpr uint32_t nonresidue = 11;
+    // true if nonresidue is negative.
+    // TODO: we're very confused by plonky3 and risc0 having different nonresidues: 11 and -11 respectively
+    static constexpr bool nonresidue_is_negative = true;
+  };
+} // namespace baby_bear
--- a/icicle/include/fields/storage.cuh
+++ b/icicle/include/fields/storage.cuh
--- a/icicle/include/gpu-utils/device_context.cu
+++ b/icicle/include/gpu-utils/device_context.cu
--- a/icicle/include/gpu-utils/device_context.cuh
+++ b/icicle/include/gpu-utils/device_context.cuh
--- a/icicle/include/gpu-utils/error_handler.cuh
+++ b/icicle/include/gpu-utils/error_handler.cuh
--- a/icicle/include/gpu-utils/modifiers.cuh
+++ b/icicle/include/gpu-utils/modifiers.cuh
--- a/icicle/include/gpu-utils/sharedmem.cuh
+++ b/icicle/include/gpu-utils/sharedmem.cuh
@@ -58,8 +58,6 @@
 #ifndef _SHAREDMEM_H_
 #define _SHAREDMEM_H_

-#include "curves/curve_config.cuh"
-
 /** @brief Wrapper class for templatized dynamic shared memory arrays.
 *
 * This struct uses template specialization on the type \a T to declare
@@ -77,8 +75,8 @@ struct SharedMemory {
  //! @returns Pointer to runtime-sized shared memory array
  __device__ T* getPointer()
  {
-    extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
-    Error_UnsupportedType();
+    // extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
+    // Error_UnsupportedType();
    return (T*)0;
  }
  // TODO: Use operator overloading to make this class look like a regular array
@@ -214,24 +212,6 @@ struct SharedMemory<uchar4> {
  }
 };

-template <>
-struct SharedMemory<curve_config::scalar_t> {
-  __device__ curve_config::scalar_t* getPointer()
-  {
-    extern __shared__ curve_config::scalar_t s_scalar_[];
-    return s_scalar_;
-  }
-};
-
-template <>
-struct SharedMemory<curve_config::projective_t> {
-  __device__ curve_config::projective_t* getPointer()
-  {
-    extern __shared__ curve_config::projective_t s_projective_[];
-    return s_projective_;
-  }
-};
-
 #endif //_SHAREDMEM_H_

 // Leave this at the end of the file
--- a/icicle/include/hash/keccak/keccak.cuh
+++ b/icicle/include/hash/keccak/keccak.cuh
@@ -3,8 +3,8 @@
 #define KECCAK_H

 #include <cstdint>
-#include "utils/device_context.cuh"
-#include "utils/error_handler.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"

 namespace keccak {
  /**
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -4,12 +4,11 @@

 #include <cuda_runtime.h>

-#include "curves/curve_config.cuh"
-#include "primitives/affine.cuh"
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#include "utils/device_context.cuh"
-#include "utils/error_handler.cuh"
+#include "curves/affine.cuh"
+#include "curves/projective.cuh"
+#include "fields/field.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"

 /**
 * @namespace msm
@@ -87,7 +86,7 @@ namespace msm {
   * @return Default value of [MSMConfig](@ref MSMConfig).
   */
  template <typename A>
-  MSMConfig DefaultMSMConfig();
+  MSMConfig DefaultMSMConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context());

  /**
   * A function that computes MSM: \f$ MSM(s_i, P_i) = \sum_{i=1}^N s_i \cdot P_i \f$.
--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -4,10 +4,9 @@

 #include <cuda_runtime.h>

-#include "curves/curve_config.cuh"
-#include "utils/device_context.cuh"
-#include "utils/error_handler.cuh"
-#include "utils/sharedmem.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/sharedmem.cuh"
 #include "utils/utils_kernels.cuh"
 #include "utils/utils.h"

@@ -56,6 +55,15 @@ namespace ntt {
  template <typename S>
  cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);

+  /* Returns the basic root of unity Wn corresponding to the basic root used to initialize the domain.
+   * Useful when computing NTT on cosets. In that case we must use the root W_2n that is between W_n and W_n+1.
+   * @param logn log size of the required root.
+   * @param ctx Details related to the device such as its id and stream id.
+   * @return Wn root of unity corresponding to logn and the basic root used for initDomain(root)
+   */
+  template <typename S>
+  S GetRootOfUnity(uint64_t logn, device_context::DeviceContext& ctx);
+
  /**
   * @enum NTTDir
   * Whether to perform normal forward NTT, or inverse NTT (iNTT). Mathematically, forward NTT computes polynomial
@@ -130,7 +138,8 @@ namespace ntt {
   * @return Default value of [NTTConfig](@ref NTTConfig).
   */
  template <typename S>
-  NTTConfig<S> DefaultNTTConfig();
+  NTTConfig<S>
+  DefaultNTTConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context());

  /**
   * A function that computes NTT or iNTT in-place. It's necessary to call [InitDomain](@ref InitDomain) with an
--- a/icicle/appUtils/ntt/ntt_impl.cuh
+++ b/icicle/appUtils/ntt/ntt_impl.cuh
@@ -3,7 +3,7 @@
 #define _NTT_IMPL_H

 #include <stdint.h>
-#include "appUtils/ntt/ntt.cuh" // for enum Ordering
+#include "ntt/ntt.cuh" // for enum Ordering

 namespace mxntt {

--- a/icicle/include/polynomials/cuda_backend/polynomial_cuda_backend.cuh
+++ b/icicle/include/polynomials/cuda_backend/polynomial_cuda_backend.cuh
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "gpu-utils/device_context.cuh"
+#include "fields/field_config.cuh"
+#include "polynomials/polynomials.h"
+
+using device_context::DeviceContext;
+
+namespace polynomials {
+  template <typename C = scalar_t, typename D = C, typename I = C>
+  class CUDAPolynomialFactory : public AbstractPolynomialFactory<C, D, I>
+  {
+    std::vector<DeviceContext> m_device_contexts; // device-id --> device context
+    std::vector<cudaStream_t> m_device_streams;   // device-id --> device stream. Storing the streams here as workaround
+                                                  // since DeviceContext has a reference to a stream.
+
+  public:
+    CUDAPolynomialFactory();
+    ~CUDAPolynomialFactory();
+    std::shared_ptr<IPolynomialContext<C, D, I>> create_context() override;
+    std::shared_ptr<IPolynomialBackend<C, D, I>> create_backend() override;
+  };
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomial_abstract_factory.h
+++ b/icicle/include/polynomials/polynomial_abstract_factory.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "polynomial_context.h"
+#include "polynomial_backend.h"
+#include <memory> // For std::shared_ptr
+
+namespace polynomials {
+
+  /**
+   * @brief Abstract factory for creating polynomial contexts and backends.
+   *
+   * The `AbstractPolynomialFactory` serves as an interface for factories capable of creating
+   * instances of `IPolynomialContext` and `IPolynomialBackend`. This design allows for the
+   * decoupling of object creation from their usage, facilitating the implementation of various
+   * computational strategies (e.g., GPU, ZPU) without altering client code. Each concrete factory
+   * is expected to provide tailored implementations of polynomial contexts and backends that
+   * are optimized for specific computational environments.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type, representing the input space of the polynomial.
+   * @tparam I Image type, representing the output space of the polynomial.
+   */
+  template <typename C, typename D, typename I>
+  class AbstractPolynomialFactory
+  {
+  public:
+    /**
+     * @brief Creates and returns a shared pointer to an `IPolynomialContext` instance.
+     *
+     * @return std::shared_ptr<IPolynomialContext<C, D, I>> A shared pointer to the created
+     *         polynomial context instance.
+     */
+    virtual std::shared_ptr<IPolynomialContext<C, D, I>> create_context() = 0;
+
+    /**
+     * @brief Creates and returns a shared pointer to an `IPolynomialBackend` instance.
+     *
+     * @return std::shared_ptr<IPolynomialBackend<C, D, I>> A shared pointer to the created
+     *         polynomial backend instance.
+     */
+    virtual std::shared_ptr<IPolynomialBackend<C, D, I>> create_backend() = 0;
+
+    /**
+     * @brief Virtual destructor for the `AbstractPolynomialFactory`.
+     */
+    virtual ~AbstractPolynomialFactory() = default;
+  };
+
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomial_backend.h
+++ b/icicle/include/polynomials/polynomial_backend.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <cstdint> // for uint64_t, int64_t
+
+namespace polynomials {
+
+  /**
+   * @brief Interface for the polynomial computational backend.
+   *
+   * The `IPolynomialBackend` interface defines the set of operations for polynomial arithmetic
+   * and manipulation that can be performed on a given computational device or platform (e.g., GPU, ZPU).
+   * This interface abstracts the computational logic, allowing for implementation-specific optimizations
+   * and hardware utilization. It interacts closely with `IPolynomialContext` to manage polynomial data
+   * states and perform computations.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type, representing the input space of the polynomial.
+   * @tparam I Image type, representing the output space of the polynomial.
+   */
+  template <typename C, typename D, typename I>
+  class IPolynomialBackend
+  {
+  public:
+    IPolynomialBackend() = default;
+    virtual ~IPolynomialBackend() = default;
+
+    typedef std::shared_ptr<IPolynomialContext<C, D, I>> PolyContext;
+
+    // Initialization methods
+    virtual void from_coefficients(PolyContext p, uint64_t nof_coefficients, const C* coefficients = nullptr) = 0;
+    virtual void from_rou_evaluations(PolyContext p, uint64_t nof_evaluations, const I* evaluations = nullptr) = 0;
+    virtual void clone(PolyContext out, PolyContext in) = 0;
+
+    // Arithmetic operations
+    virtual void add(PolyContext& out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void subtract(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void multiply(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void multiply(PolyContext out, PolyContext p, D scalar) = 0; // scalar multiplication
+    virtual void divide(PolyContext Quotient_out, PolyContext Remainder_out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void quotient(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void remainder(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void divide_by_vanishing_polynomial(PolyContext out, PolyContext op_a, uint64_t vanishing_poly_degree) = 0;
+
+    // Operations specific to monomials
+    virtual void add_monomial_inplace(PolyContext& poly, C monomial_coeff, uint64_t monomial) = 0;
+    virtual void sub_monomial_inplace(PolyContext& poly, C monomial_coeff, uint64_t monomial) = 0;
+
+    // Utility methods
+    virtual void slice(PolyContext out, PolyContext in, uint64_t offset, uint64_t stride, uint64_t size) = 0;
+    virtual int64_t degree(PolyContext op) = 0;
+
+    // Method to access mutable storage within the context
+    void* get_context_storage_mutable(PolyContext ctxt) { return ctxt->get_storage_mutable(); }
+    const void* get_context_storage_immutable(PolyContext ctxt) { return ctxt->get_storage_immutable(); }
+
+    // Evaluation methods
+    virtual I evaluate(PolyContext op, const D& domain_x) = 0;
+    virtual void evaluate_on_domain(PolyContext op, const D* domain, uint64_t size, I* evaluations /*OUT*/) = 0;
+
+    // Methods to copy coefficients to host memory
+    virtual C copy_coefficient_to_host(PolyContext op, uint64_t coeff_idx) = 0;
+    virtual int64_t
+    copy_coefficients_to_host(PolyContext op, C* host_coeffs, int64_t start_idx = 0, int64_t end_idx = -1) = 0;
+
+    // Methods to get views of coefficients and evaluations, including device id
+    virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_coefficients_view(PolyContext p) = 0;
+    virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
+  };
+
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomial_context.h
+++ b/icicle/include/polynomials/polynomial_context.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <utility>   // for std::pair
+#include <tuple>     // for std::tuple
+#include <iostream>  // for std::ostream
+#include <algorithm> // for std::max
+#include <cstdint>   // for uint64_t, etc.
+#include <memory>
+#include "utils/integrity_pointer.h"
+
+namespace polynomials {
+
+  template <typename Coeff, typename Domain, typename Image>
+  class IPolynomialBackend;
+
+  /**
+   * @brief Interface for polynomial context, encapsulating state, memory, and device context.
+   *
+   * This interface is designed to manage the state of polynomials including their coefficients and
+   * evaluations in both natural and reversed order. It supports operations for converting between
+   * these forms, allocating and releasing resources, and accessing the underlying data. The context
+   * abstracts over the specifics of memory and execution context, allowing polynomials to be managed
+   * in a way that is agnostic to the underlying hardware or software infrastructure.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type, representing the input space of the polynomial.
+   * @tparam I Image type, representing the output space of the polynomial.
+   */
+  template <typename C, typename D, typename I>
+  class IPolynomialContext
+  {
+  public:
+    friend class IPolynomialBackend<C, D, I>;
+
+    // Enumerates the possible states of a polynomial context.
+    enum State { Invalid, Coefficients, EvaluationsOnRou_Natural, EvaluationsOnRou_Reversed };
+
+    // The size of the largest element among coefficients and evaluations.
+    static constexpr size_t ElementSize = std::max(sizeof(C), sizeof(I));
+
+    /**
+     * @brief Construct a new IPolynomialContext object.
+     */
+    IPolynomialContext() : m_id{s_id++} {}
+
+    /**
+     * @brief Virtual destructor for IPolynomialContext.
+     */
+    virtual ~IPolynomialContext() = default;
+
+    // Methods for initializing the context from coefficients or evaluations.
+    virtual void from_coefficients(uint64_t nof_coefficients, const C* coefficients = nullptr) = 0;
+    virtual void from_rou_evaluations(uint64_t nof_evaluations, const I* evaluations = nullptr) = 0;
+
+    // Method for cloning the context from another instance.
+    virtual void clone(IPolynomialContext& from) = 0;
+
+    // Methods for resource management.
+    virtual void allocate(uint64_t nof_elements, State init_state = State::Coefficients, bool memset_zeros = true) = 0;
+    virtual void release() = 0;
+
+    // Methods for transforming between coefficients and evaluations.
+    virtual void transform_to_coefficients(uint64_t nof_coefficients = 0) = 0;
+    virtual void transform_to_evaluations(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
+
+    // Accessors for the state and number of elements.
+    virtual State get_state() const = 0;
+    virtual uint64_t get_nof_elements() const = 0;
+
+    // Methods to get direct access to coefficients and evaluations.
+    virtual std::pair<const C*, uint64_t> get_coefficients() = 0;
+    virtual std::pair<const I*, uint64_t> get_rou_evaluations() = 0;
+
+    // Methods to get views of coefficients and evaluations, including device id.
+    virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view() = 0;
+    virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
+
+    // Method for printing the context state to an output stream.
+    virtual void print(std::ostream& os) = 0;
+
+  protected:
+    // Provides mutable access to the underlying storage for backend computations.
+    virtual void* get_storage_mutable() = 0;
+    virtual const void* get_storage_immutable() = 0;
+
+    // Static and instance variables for debug id management.
+    static inline uint64_t s_id = 0; // Global id counter.
+
+  public:
+    const uint64_t m_id;
+  };
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomials.h
+++ b/icicle/include/polynomials/polynomials.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include "utils/integrity_pointer.h"
+#include "fields/field_config.cuh"
+
+#include "polynomial_context.h"
+#include "polynomial_backend.h"
+#include "polynomial_abstract_factory.h"
+
+using namespace field_config;
+
+namespace polynomials {
+
+  /**
+   * @brief Represents a polynomial and provides operations for polynomial arithmetic, evaluation, and manipulation.
+   *
+   * This class models a polynomial with coefficients of type `Coeff`, defined over a domain `Domain` and producing
+   * outputs of type `Image`. It supports a range of operations including basic arithmetic (addition, subtraction,
+   * multiplication, division), evaluation at points or over domains, and manipulation (slicing, adding monomials).
+   * The implementation abstracts over the specifics of computation and storage through the use of an abstract factory,
+   * contexts, and backends, allowing for efficient execution across various computational environments.
+   *
+   * @tparam Coeff Type of the coefficients of the polynomial.
+   * @tparam Domain Type representing the input space of the polynomial (defaults to `Coeff`).
+   * @tparam Image Type representing the output space of the polynomial (defaults to `Coeff`).
+   */
+  template <typename Coeff, typename Domain = Coeff, typename Image = Coeff>
+  class Polynomial
+  {
+  public:
+    // Initialization (coefficients/evaluations can reside on host or device)
+    static Polynomial from_coefficients(const Coeff* coefficients, uint64_t nof_coefficients);
+    static Polynomial from_rou_evaluations(const Image* evaluations, uint64_t nof_evaluations);
+
+    // Clone the polynomial
+    Polynomial clone() const;
+
+    // Arithmetic ops
+    Polynomial operator+(const Polynomial& rhs) const;
+    Polynomial& operator+=(const Polynomial& rhs);
+
+    Polynomial operator-(const Polynomial& rhs) const;
+
+    Polynomial operator*(const Polynomial& rhs) const;
+    Polynomial operator*(const Domain& scalar) const; // scalar multiplication
+    template <typename C, typename D, typename I>
+    friend Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs);
+
+    std::pair<Polynomial, Polynomial> divide(const Polynomial& rhs) const; //  returns (Q(x), R(x))
+    Polynomial operator/(const Polynomial& rhs) const; // returns Quotient Q(x) for A(x) = Q(x)B(x) + R(x)
+    Polynomial operator%(const Polynomial& rhs) const; // returns Remainder R(x) for A(x) = Q(x)B(x) + R(x)
+    Polynomial divide_by_vanishing_polynomial(uint64_t degree) const;
+
+    // arithmetic ops with monomial
+    Polynomial& add_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
+    Polynomial& sub_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
+
+    // Slicing and selecting even or odd components.
+    Polynomial slice(uint64_t offset, uint64_t stride, uint64_t size = 0 /*0 means take all elements*/);
+    Polynomial even();
+    Polynomial odd();
+
+    // Note: Following ops cannot be traced. Calling them invokes polynomial evaluation
+
+    // Evaluation methods
+    Image operator()(const Domain& x) const;
+    Image evaluate(const Domain& x) const;
+    void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
+
+    // Method to obtain the degree of the polynomial
+    int64_t degree();
+
+    // Methods for copying coefficients to host memory.
+    Coeff copy_coefficient_to_host(uint64_t idx) const; // single coefficient
+    // caller is allocating output memory. If coeff==nullptr, returning nof_coeff only
+    int64_t copy_coefficients_to_host(Coeff* host_coeffs = nullptr, int64_t start_idx = 0, int64_t end_idx = -1) const;
+
+    // Methods for obtaining a view of the coefficients or evaluations
+    std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
+    std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
+
+    // Overload stream insertion operator for printing.
+    friend std::ostream& operator<<(std::ostream& os, Polynomial& poly)
+    {
+      poly.m_context->print(os);
+      return os;
+    }
+
+    // Static method to initialize the polynomial class with a factory for context and backend creation.
+    static void initialize(std::shared_ptr<AbstractPolynomialFactory<Coeff, Domain, Image>> factory)
+    {
+      std::atexit(cleanup);
+      s_factory = factory;
+    }
+
+    // Cleanup method for releasing factory resources.
+    static void cleanup() { s_factory = nullptr; }
+
+  private:
+    // The context of the polynomial, encapsulating its state.
+    std::shared_ptr<IPolynomialContext<Coeff, Domain, Image>> m_context = nullptr;
+    // The computational backend for the polynomial operations.
+    std::shared_ptr<IPolynomialBackend<Coeff, Domain, Image>> m_backend = nullptr;
+
+    // Factory for constructing the context and backend instances.
+    static inline std::shared_ptr<AbstractPolynomialFactory<Coeff, Domain, Image>> s_factory = nullptr;
+
+  public:
+    Polynomial();
+    ~Polynomial() = default;
+
+    // Ensures polynomials can be moved but not copied, to manage resources efficiently.
+    Polynomial(Polynomial&&) = default;
+    Polynomial& operator=(Polynomial&&) = default;
+    Polynomial(const Polynomial&) = delete;
+    Polynomial& operator=(const Polynomial&) = delete;
+
+    std::shared_ptr<IPolynomialContext<Coeff, Domain, Image>> get_context() { return m_context; }
+  };
+
+  // explicit instantiation
+
+  // Friend operator to allow multiplication with a scalar from the left-hand side
+  template <typename C = scalar_t, typename D = C, typename I = C>
+  Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs);
+
+  // External template instantiation to ensure the template is compiled for specific types.
+  extern template class Polynomial<scalar_t>;
+
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomials_c_api.h
+++ b/icicle/include/polynomials/polynomials_c_api.h
@@ -0,0 +1,247 @@
+#pragma once
+
+#include "polynomials.h"
+#include "fields/field_config.cuh"
+#include "utils/utils.h"
+#include "utils/integrity_pointer.h"
+
+namespace polynomials {
+  extern "C" {
+
+  // Defines a polynomial instance based on the scalar type from the FIELD configuration.
+  typedef Polynomial<scalar_t> PolynomialInst;
+
+  // Constructs a polynomial from a set of coefficients.
+  // coeffs: Array of coefficients.
+  // size: Number of coefficients in the array.
+  // Returns a pointer to the newly created polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_coefficients)(scalar_t* coeffs, size_t size)
+  {
+    auto result = new PolynomialInst(PolynomialInst::from_coefficients(coeffs, size));
+    return result;
+  }
+
+  // Constructs a polynomial from evaluations at the roots of unity.
+  // evals: Array of evaluations.
+  // size: Number of evaluations in the array.
+  // Returns a pointer to the newly created polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_rou_evaluations)(scalar_t* evals, size_t size)
+  {
+    auto result = new PolynomialInst(PolynomialInst::from_rou_evaluations(evals, size));
+    return result;
+  }
+
+  // Clones an existing polynomial instance.
+  // p: Pointer to the polynomial instance to clone.
+  // Returns a pointer to the cloned polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_clone)(const PolynomialInst* p)
+  {
+    auto result = new PolynomialInst(p->clone());
+    return result;
+  }
+
+  // Deletes a polynomial instance, freeing its memory.
+  // instance: Pointer to the polynomial instance to delete.
+  void CONCAT_EXPAND(FIELD, polynomial_delete)(PolynomialInst* instance) { delete instance; }
+
+  // Adds two polynomials.
+  // a, b: Pointers to the polynomial instances to add.
+  // Returns a pointer to the resulting polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_add)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    auto result = new PolynomialInst(std::move(*a + *b));
+    return result;
+  }
+
+  // Adds a polynomial to another in place.
+  // a: Pointer to the polynomial to add to.
+  // b: Pointer to the polynomial to add.
+  void CONCAT_EXPAND(FIELD, polynomial_add_inplace)(PolynomialInst* a, const PolynomialInst* b) { *a += *b; }
+
+  // Subtracts one polynomial from another.
+  // a, b: Pointers to the polynomial instances (minuend and subtrahend, respectively).
+  // Returns a pointer to the resulting polynomial instance.
+
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_subtract)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    auto result = new PolynomialInst(std::move(*a - *b));
+    return result;
+  }
+
+  // Multiplies two polynomials.
+  // a, b: Pointers to the polynomial instances to multiply.
+  // Returns a pointer to the resulting polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    auto result = new PolynomialInst(std::move(*a * *b));
+    return result;
+  }
+
+  // Multiplies a polynomial by a scalar coefficient.
+  // a: Pointer to the polynomial instance.
+  // coeff: Scalar coefficient to multiply by.
+  // Returns a pointer to the resulting polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply_by_coeff)(const PolynomialInst* a, const scalar_t* coeff)
+  {
+    auto result = new PolynomialInst(std::move(*a * *coeff));
+    return result;
+  }
+
+  // Divides one polynomial by another, returning both quotient and remainder.
+  // a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
+  // q: Output parameter for the quotient.
+  // r: Output parameter for the remainder.
+  void CONCAT_EXPAND(FIELD, polynomial_division)(
+    const PolynomialInst* a, const PolynomialInst* b, PolynomialInst** q /*OUT*/, PolynomialInst** r /*OUT*/)
+  {
+    auto [_q, _r] = a->divide(*b);
+    *q = new PolynomialInst(std::move(_q));
+    *r = new PolynomialInst(std::move(_r));
+  }
+
+  // Calculates the quotient of dividing one polynomial by another.
+  // a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
+  // Returns a pointer to the resulting quotient polynomial instance.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_quotient)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    auto result = new PolynomialInst(std::move(*a / *b));
+    return result;
+  }
+
+  // Calculates the remainder of dividing one polynomial by another.
+  // a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
+  // Returns a pointer to the resulting remainder polynomial instance.
+
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_remainder)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    auto result = new PolynomialInst(std::move(*a % *b));
+    return result;
+  }
+
+  // Divides a polynomial by a vanishing polynomial of a given degree, over rou domain.
+  // p: Pointer to the polynomial instance.
+  // vanishing_poly_degree: Degree of the vanishing polynomial.
+  // Returns a pointer to the resulting polynomial instance.
+  PolynomialInst*
+  CONCAT_EXPAND(FIELD, polynomial_divide_by_vanishing)(const PolynomialInst* p, uint64_t vanishing_poly_degree)
+  {
+    auto result = new PolynomialInst(std::move(p->divide_by_vanishing_polynomial(vanishing_poly_degree)));
+    return result;
+  }
+
+  // Adds a monomial to a polynomial in place.
+  // p: Pointer to the polynomial instance.
+  // monomial_coeff: Coefficient of the monomial to add.
+  // monomial: Degree of the monomial to add.
+  void CONCAT_EXPAND(FIELD, polynomial_add_monomial_inplace)(
+    PolynomialInst* p, const scalar_t* monomial_coeff, uint64_t monomial)
+  {
+    p->add_monomial_inplace(*monomial_coeff, monomial);
+  }
+
+  // Subtracts a monomial from a polynomial in place.
+  // p: Pointer to the polynomial instance.
+  // monomial_coeff: Coefficient of the monomial to subtract.
+  // monomial: Degree of the monomial to subtract.
+  void CONCAT_EXPAND(FIELD, polynomial_sub_monomial_inplace)(
+    PolynomialInst* p, const scalar_t* monomial_coeff, uint64_t monomial)
+  {
+    p->sub_monomial_inplace(*monomial_coeff, monomial);
+  }
+
+  // Evaluates a polynomial at a given point.
+  // p: Pointer to the polynomial instance.
+  // x: Point at which to evaluate the polynomial.
+  // Returns the evaluation result.
+  scalar_t CONCAT_EXPAND(FIELD, polynomial_evaluate)(const PolynomialInst* p, const scalar_t& x)
+  {
+    return p->evaluate(x);
+  }
+
+  // Evaluates a polynomial on a domain of points.
+  // p: Pointer to the polynomial instance.
+  // domain: Array of points constituting the domain.
+  // domain_size: Number of points in the domain.
+  // evals: Output array for the evaluations.
+  void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_domain)(
+    const PolynomialInst* p, scalar_t* domain, uint64_t domain_size, scalar_t* evals /*OUT*/)
+  {
+    return p->evaluate_on_domain(domain, domain_size, evals);
+  }
+
+  // Returns the degree of a polynomial.
+  // p: Pointer to the polynomial instance.
+  // Returns the degree of the polynomial.
+  int64_t CONCAT_EXPAND(FIELD, polynomial_degree)(PolynomialInst* p) { return p->degree(); }
+
+  // Copies a single coefficient of a polynomial to host memory.
+  // p: Pointer to the polynomial instance.
+  // idx: Index of the coefficient to copy.
+  // Returns the coefficient value.
+  scalar_t CONCAT_EXPAND(FIELD, polynomial_copy_single_coeff_to_host)(PolynomialInst* p, uint64_t idx)
+  {
+    return p->copy_coefficient_to_host(idx);
+  }
+
+  // Copies a range of polynomial coefficients to host memory.
+  // p: Pointer to the polynomial instance.
+  // host_memory: Array to copy the coefficients into. If NULL, not copying.
+  // start_idx: Start index of the range to copy.
+  // end_idx: End index of the range to copy.
+  // Returns the number of coefficients copied. if host_memory is NULL, returns number of coefficients.
+  int64_t CONCAT_EXPAND(FIELD, polynomial_coeffs_to_host)(
+    PolynomialInst* p, scalar_t* host_memory, uint64_t start_idx, uint64_t end_idx)
+  {
+    return p->copy_coefficients_to_host(host_memory, start_idx, end_idx);
+  }
+
+  // Retrieves a device-memory view of the polynomial coefficients.
+  // p: Pointer to the polynomial instance.
+  // size: Output parameter for the size of the view.
+  // device_id: Output parameter for the device ID.
+  // Returns a pointer to an integrity pointer encapsulating the coefficients view.
+  IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_coeff_view)(
+    PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
+  {
+    auto [coeffs, _size, _device_id] = p->get_coefficients_view();
+    *size = _size;
+    *device_id = _device_id;
+    return new IntegrityPointer<scalar_t>(std::move(coeffs));
+  }
+
+  // Retrieves a device-memory view of the polynomial's evaluations on the roots of unity.
+  // p: Pointer to the polynomial instance.
+  // nof_evals: Number of evaluations.
+  // is_reversed: Whether the evaluations are in reversed order.
+  // size: Output parameter for the size of the view.
+  // device_id: Output parameter for the device ID.
+  // Returns a pointer to an integrity pointer encapsulating the evaluations view.
+  IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
+    PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
+  {
+    auto [rou_evals, _size, _device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
+    *size = _size;
+    *device_id = _device_id;
+    return new IntegrityPointer<scalar_t>(std::move(rou_evals));
+  }
+
+  // Reads the pointer from an integrity pointer.
+  // p: Pointer to the integrity pointer.
+  // Returns the raw pointer if still valid, otherwise NULL.
+  const scalar_t* CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_get)(IntegrityPointer<scalar_t>* p)
+  {
+    return p->get();
+  }
+
+  // Checks if an integrity pointer is still valid.
+  // p: Pointer to the integrity pointer.
+  // Returns true if the pointer is valid, false otherwise.
+  bool CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_is_valid)(IntegrityPointer<scalar_t>* p) { return p->isValid(); }
+
+  // Destroys an integrity pointer, freeing its resources.
+  // p: Pointer to the integrity pointer to destroy.
+  void CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_destroy)(IntegrityPointer<scalar_t>* p) { delete p; }
+
+  } // extern "C"
+
+} // namespace polynomials
--- a/icicle/appUtils/poseidon/constants/bls12_377_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bls12_377_poseidon.h
--- a/icicle/appUtils/poseidon/constants/bls12_381_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bls12_381_poseidon.h
--- a/icicle/appUtils/poseidon/constants/bn254_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bn254_poseidon.h
--- a/icicle/appUtils/poseidon/constants/bw6_761_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bw6_761_poseidon.h
--- a/icicle/appUtils/poseidon/constants/constants_template.h
+++ b/icicle/appUtils/poseidon/constants/constants_template.h
--- a/icicle/appUtils/poseidon/constants/generate_parameters.py
+++ b/icicle/appUtils/poseidon/constants/generate_parameters.py
--- a/icicle/appUtils/poseidon/constants/grumpkin_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/grumpkin_poseidon.h
--- a/icicle/appUtils/poseidon/poseidon.cuh
+++ b/icicle/appUtils/poseidon/poseidon.cuh
@@ -4,9 +4,8 @@

 #include <cstdint>
 #include <stdexcept>
-#include "utils/device_context.cuh"
-#include "curves/curve_config.cuh"
-#include "utils/error_handler.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
 #include "utils/utils.h"

 /**
--- a/icicle/include/poseidon/tree/merkle.cuh
+++ b/icicle/include/poseidon/tree/merkle.cuh
@@ -2,10 +2,10 @@
 #ifndef MERKLE_H
 #define MERKLE_H

-#include "utils/device_context.cuh"
-#include "utils/error_handler.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
 #include "utils/utils.h"
-#include "appUtils/poseidon/poseidon.cuh"
+#include "poseidon/poseidon.cuh"

 #include <iostream>
 #include <math.h>
--- a/icicle/include/utils/integrity_pointer.h
+++ b/icicle/include/utils/integrity_pointer.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <memory>
+#include <iostream>
+#include <stdexcept>
+
+/**
+ * @brief A template class that wraps a raw pointer with additional checks for data integrity.
+ *
+ * IntegrityPointer is designed to wrap a raw pointer and associate it with a validation
+ * mechanism based on a counter. This counter is monitored via a std::weak_ptr, allowing
+ * the IntegrityPointer to check if the data it points to has potentially been invalidated.
+ * It is intended for scenarios where there's a need to ensure the integrity of the pointed-to
+ * data throughout the lifetime of the pointer, particularly useful in complex systems where
+ * data validity can change over time due to external factors.
+ *
+ * Usage involves providing the raw pointer to be wrapped, a std::weak_ptr to a counter, and
+ * the expected value of that counter. The IntegrityPointer can then be used much like a normal
+ * pointer, with the addition of integrity checks before access.
+ *
+ * @tparam T The type of the pointed-to object.
+ */
+
+template <typename T>
+class IntegrityPointer
+{
+public:
+  /**
+   * Constructs an IntegrityPointer wrapping a raw pointer with a validity check based on a counter.
+   *
+   * @param ptr A raw pointer to the data of type T.
+   * @param counterWeakPtr A std::weak_ptr to an int counter, used for validation.
+   * @param expectedCounterValue The expected value of the counter for the pointer to be considered valid.
+   */
+  IntegrityPointer(const T* ptr, std::weak_ptr<int> counterWeakPtr, int expectedCounterValue)
+      : m_ptr(ptr), m_counterWeakPtr(counterWeakPtr), m_expectedCounterValue(expectedCounterValue)
+  {
+  }
+  IntegrityPointer(const IntegrityPointer& other) = default;
+  IntegrityPointer(IntegrityPointer&& other) = default;
+
+  /**
+   * Retrieves the raw pointer. Use with caution, as direct access bypasses validity checks.
+   * @return T* The raw pointer to the data.
+   */
+  const T* get() const { return isValid() ? m_ptr : nullptr; }
+
+  /**
+   * Dereferences the pointer. Throws std::runtime_error if the pointer is invalid.
+   * @return A reference to the data pointed to by the raw pointer.
+   */
+  const T& operator*() const
+  {
+    assertValid();
+    return *m_ptr;
+  }
+
+  /**
+   * Provides access to the member of the pointed-to object. Throws std::runtime_error if the pointer is invalid.
+   * @return T* The raw pointer to the data.
+   */
+  const T* operator->() const
+  {
+    assertValid();
+    return m_ptr;
+  }
+
+  /**
+   * Checks whether the pointer is still considered valid by comparing the current value of the counter
+   * to the expected value.
+   * @return true if the pointer is valid, false otherwise.
+   */
+  bool isValid() const
+  {
+    if (auto counterSharedPtr = m_counterWeakPtr.lock()) { return *counterSharedPtr == m_expectedCounterValue; }
+    return false;
+  }
+
+private:
+  const T* m_ptr;                      ///< The raw pointer to the data.
+  std::weak_ptr<int> m_counterWeakPtr; ///< A weak pointer to the counter used for validation.
+  const int m_expectedCounterValue;    ///< The expected value of the counter for the pointer to be valid.
+
+  /**
+   * Asserts the validity of the pointer. Throws std::runtime_error if the pointer is invalid.
+   */
+  void assertValid() const
+  {
+    if (!isValid()) {
+      logInvalidAccess();
+      throw std::runtime_error("Attempted to access invalidated IntegrityPointer.");
+    }
+  }
+
+  /**
+   * Logs an attempt to access an invalidated pointer.
+   */
+  static void logInvalidAccess()
+  {
+    std::cerr << "Warning: Attempted to access invalidated IntegrityPointer." << std::endl;
+  }
+};
--- a/icicle/include/utils/mont.cuh
+++ b/icicle/include/utils/mont.cuh
--- a/icicle/include/utils/test_functions.cuh
+++ b/icicle/include/utils/test_functions.cuh
@@ -0,0 +1,142 @@
+#pragma once
+
+template <class T1, class T2, int N_REP>
+__global__ void add_elements_kernel(const T1* x, const T2* y, T1* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  T1 res = x[gid];
+  T2 y_gid = y[gid];
+  for (int i = 0; i < N_REP; i++)
+    res = res + y_gid;
+  result[gid] = res;
+}
+
+template <class T1, class T2, int N_REP = 1>
+int vec_add(const T1* x, const T2* y, T1* result, const unsigned count)
+{
+  add_elements_kernel<T1, T2, N_REP><<<(count - 1) / 256 + 1, 256>>>(x, y, result, count);
+  int error = cudaGetLastError();
+  return (error || (N_REP > 1)) ? error : cudaDeviceSynchronize();
+}
+
+template <class T1, class T2>
+__global__ void sub_elements_kernel(const T1* x, const T2* y, T1* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  result[gid] = x[gid] - y[gid];
+}
+
+template <class T1, class T2>
+int vec_sub(const T1* x, const T2* y, T1* result, const unsigned count)
+{
+  sub_elements_kernel<T1, T2><<<(count - 1) / 256 + 1, 256>>>(x, y, result, count);
+  int error = cudaGetLastError();
+  return error ? error : cudaDeviceSynchronize();
+}
+
+template <class T>
+__global__ void neg_elements_kernel(const T* x, T* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  result[gid] = T::neg(x[gid]);
+}
+
+template <class T>
+int vec_neg(const T* x, T* result, const unsigned count)
+{
+  neg_elements_kernel<T><<<(count - 1) / 256 + 1, 256>>>(x, result, count);
+  int error = cudaGetLastError();
+  return error ? error : cudaDeviceSynchronize();
+}
+
+template <class F, class G, int N_REP>
+__global__ void mul_elements_kernel(const F* x, const G* y, G* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  F x_gid = x[gid];
+  G res = y[gid];
+  for (int i = 0; i < N_REP; i++)
+    res = res * x_gid;
+  result[gid] = res;
+}
+
+template <class F, class G, int N_REP = 1>
+int vec_mul(const F* x, const G* y, G* result, const unsigned count)
+{
+  mul_elements_kernel<F, G, N_REP><<<(count - 1) / 256 + 1, 256>>>(x, y, result, count);
+  int error = cudaGetLastError();
+  return (error || (N_REP > 1)) ? error : cudaDeviceSynchronize();
+}
+
+template <class F>
+__global__ void inv_field_elements_kernel(const F* x, F* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  result[gid] = F::inverse(x[gid]);
+}
+
+template <class F>
+int field_vec_inv(const F* x, F* result, const unsigned count)
+{
+  inv_field_elements_kernel<<<(count - 1) / 256 + 1, 256>>>(x, result, count);
+  int error = cudaGetLastError();
+  return error ? error : cudaDeviceSynchronize();
+}
+
+template <class F, int N_REP>
+__global__ void sqr_field_elements_kernel(const F* x, F* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  F x_gid = x[gid];
+  for (int i = 0; i < N_REP; i++)
+    x_gid = F::sqr(x_gid);
+  result[gid] = x_gid;
+}
+
+template <class F, int N_REP = 1>
+int field_vec_sqr(const F* x, F* result, const unsigned count)
+{
+  sqr_field_elements_kernel<F, N_REP><<<(count - 1) / 256 + 1, 256>>>(x, result, count);
+  int error = cudaGetLastError();
+  return (error || (N_REP > 1)) ? error : cudaDeviceSynchronize();
+}
+
+template <class P, class A>
+__global__ void to_affine_points_kernel(const P* x, A* result, const unsigned count)
+{
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count) return;
+  result[gid] = P::to_affine(x[gid]);
+}
+
+template <class P, class A>
+int point_vec_to_affine(const P* x, A* result, const unsigned count)
+{
+  to_affine_points_kernel<P, A><<<(count - 1) / 256 + 1, 256>>>(x, result, count);
+  int error = cudaGetLastError();
+  return error ? error : cudaDeviceSynchronize();
+}
+
+template <class T>
+int device_populate_random(T* d_elements, unsigned n)
+{
+  T* h_elements = (T*)malloc(n * sizeof(T));
+  for (unsigned i = 0; i < n; i++)
+    h_elements[i] = T::rand_host();
+  return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
+}
+
+template <class T>
+int device_set(T* d_elements, T el, unsigned n)
+{
+  T* h_elements = (T*)malloc(n * sizeof(T));
+  for (unsigned i = 0; i < n; i++)
+    h_elements[i] = el;
+  return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
+}
--- a/icicle/include/utils/utils.h
+++ b/icicle/include/utils/utils.h
--- a/icicle/include/utils/utils_kernels.cuh
+++ b/icicle/include/utils/utils_kernels.cuh
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -2,7 +2,7 @@
 #ifndef LDE_H
 #define LDE_H

-#include "device_context.cuh"
+#include "gpu-utils/device_context.cuh"

 /**
 * @namespace vec_ops
@@ -24,9 +24,6 @@ namespace vec_ops {

    bool is_result_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */

-    bool is_result_montgomery_form; /**< True if `result` vector should be in Montgomery form and false otherwise.
-                                     *   Default value: false. */
-
    bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be
                    *   non-blocking and you'd need to synchronize it explicitly by running
                    *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
@@ -46,7 +43,6 @@ namespace vec_ops {
      false, // is_a_on_device
      false, // is_b_on_device
      false, // is_result_on_device
-      false, // is_result_montgomery_form
      false, // is_async
    };
    return config;
@@ -64,7 +60,7 @@ namespace vec_ops {
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename E, typename S>
-  cudaError_t Mul(S* vec_a, E* vec_b, int n, VecOpsConfig<E>& config, E* result);
+  cudaError_t Mul(const S* vec_a, const E* vec_b, int n, VecOpsConfig<E>& config, E* result);

  /**
   * A function that adds two vectors element-wise.
@@ -79,7 +75,7 @@ namespace vec_ops {
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename E>
-  cudaError_t Add(E* vec_a, E* vec_b, int n, VecOpsConfig<E>& config, E* result);
+  cudaError_t Add(const E* vec_a, const E* vec_b, int n, VecOpsConfig<E>& config, E* result);

  /**
   * A function that subtracts two vectors element-wise.
@@ -94,7 +90,7 @@ namespace vec_ops {
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename E>
-  cudaError_t Sub(E* vec_a, E* vec_b, int n, VecOpsConfig<E>& config, E* result);
+  cudaError_t Sub(const E* vec_a, const E* vec_b, int n, VecOpsConfig<E>& config, E* result);

  /**
   * Transposes an input matrix out-of-place inside GPU.
--- a/icicle/primitives/field.cu
+++ b/icicle/primitives/field.cu
@@ -1,10 +0,0 @@
-#include "curves/curve_config.cuh"
-#include "field.cuh"
-#include "utils/utils.h"
-
-using namespace curve_config;
-
-extern "C" void CONCAT_EXPAND(CURVE, GenerateScalars)(scalar_t* scalars, int size)
-{
-  scalar_t::RandHostMany(scalars, size);
-}
--- a/icicle/primitives/test_kernels.cuh
+++ b/icicle/primitives/test_kernels.cuh
@@ -1,113 +0,0 @@
-#pragma once
-
-#include "curves/curve_config.cuh"
-
-using namespace curve_config;
-
-template <class T1, class T2>
-__global__ void add_elements_kernel(const T1* x, const T2* y, T1* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = x[gid] + y[gid];
-}
-
-template <class T1, class T2>
-int vec_add(const T1* x, const T2* y, T1* result, const unsigned count)
-{
-  add_elements_kernel<T1, T2><<<(count - 1) / 32 + 1, 32>>>(x, y, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-template <class T1, class T2>
-__global__ void sub_elements_kernel(const T1* x, const T2* y, T1* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = x[gid] - y[gid];
-}
-
-template <class T1, class T2>
-int vec_sub(const T1* x, const T2* y, T1* result, const unsigned count)
-{
-  sub_elements_kernel<T1, T2><<<(count - 1) / 32 + 1, 32>>>(x, y, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-template <class T>
-__global__ void neg_elements_kernel(const T* x, T* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = T::neg(x[gid]);
-}
-
-template <class T>
-int vec_neg(const T* x, T* result, const unsigned count)
-{
-  neg_elements_kernel<T><<<(count - 1) / 32 + 1, 32>>>(x, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-template <class F, class G>
-__global__ void mul_elements_kernel(const F* x, const G* y, G* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = x[gid] * y[gid];
-}
-
-template <class F, class G>
-int vec_mul(const F* x, const G* y, G* result, const unsigned count)
-{
-  mul_elements_kernel<F, G><<<(count - 1) / 32 + 1, 32>>>(x, y, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void inv_field_elements_kernel(const scalar_t* x, scalar_t* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = scalar_t::inverse(x[gid]);
-}
-
-int field_vec_inv(const scalar_t* x, scalar_t* result, const unsigned count)
-{
-  inv_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-__global__ void sqr_field_elements_kernel(const scalar_t* x, scalar_t* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = scalar_t::sqr(x[gid]);
-}
-
-int field_vec_sqr(const scalar_t* x, scalar_t* result, const unsigned count)
-{
-  sqr_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
-
-template <class P, class A>
-__global__ void to_affine_points_kernel(const P* x, A* result, const unsigned count)
-{
-  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= count) return;
-  result[gid] = P::to_affine(x[gid]);
-}
-
-template <class P, class A>
-int point_vec_to_affine(const P* x, A* result, const unsigned count)
-{
-  to_affine_points_kernel<P, A><<<(count - 1) / 32 + 1, 32>>>(x, result, count);
-  int error = cudaGetLastError();
-  return error ? error : cudaDeviceSynchronize();
-}
--- a/icicle/src/curves/CMakeLists.txt
+++ b/icicle/src/curves/CMakeLists.txt
@@ -0,0 +1,25 @@
+if (G2)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
+endif ()
+
+set(TARGET icicle_curve)
+set(FIELD_TARGET icicle_field)
+
+set(SRC ${CMAKE_SOURCE_DIR}/src)
+
+set(CURVE_SOURCE ${SRC}/curves/extern.cu)
+list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cu)
+if(G2)
+  list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cu)
+  list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cu)
+endif()
+if(ECNTT)
+  list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cu)
+  list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cu)
+endif()
+
+add_library(${TARGET} STATIC ${CURVE_SOURCE})
+target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
+target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
+target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
--- a/icicle/src/curves/extern.cu
+++ b/icicle/src/curves/extern.cu
@@ -0,0 +1,51 @@
+#include "curves/curve_config.cuh"
+
+using namespace curve_config;
+
+#include "gpu-utils/device_context.cuh"
+#include "utils/utils.h"
+#include "utils/mont.cuh"
+
+extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
+{
+  return (*point1 == *point2) &&
+         !((point1->x == point_field_t::zero()) && (point1->y == point_field_t::zero()) &&
+           (point1->z == point_field_t::zero())) &&
+         !((point2->x == point_field_t::zero()) && (point2->y == point_field_t::zero()) &&
+           (point2->z == point_field_t::zero()));
+}
+
+extern "C" void CONCAT_EXPAND(CURVE, ToAffine)(projective_t* point, affine_t* point_out)
+{
+  *point_out = projective_t::to_affine(*point);
+}
+
+extern "C" void CONCAT_EXPAND(CURVE, GenerateProjectivePoints)(projective_t* points, int size)
+{
+  projective_t::RandHostMany(points, size);
+}
+
+extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int size)
+{
+  projective_t::RandHostManyAffine(points, size);
+}
+
+extern "C" cudaError_t CONCAT_EXPAND(CURVE, AffineConvertMontgomery)(
+  affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+{
+  if (is_into) {
+    return mont::ToMontgomery(d_inout, n, ctx.stream, d_inout);
+  } else {
+    return mont::FromMontgomery(d_inout, n, ctx.stream, d_inout);
+  }
+}
+
+extern "C" cudaError_t CONCAT_EXPAND(CURVE, ProjectiveConvertMontgomery)(
+  projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+{
+  if (is_into) {
+    return mont::ToMontgomery(d_inout, n, ctx.stream, d_inout);
+  } else {
+    return mont::FromMontgomery(d_inout, n, ctx.stream, d_inout);
+  }
+}
--- a/icicle/primitives/projective.cu
+++ b/icicle/primitives/projective.cu
@@ -1,37 +1,10 @@
 #include "curves/curve_config.cuh"
-#include "projective.cuh"
-#include <cuda.h>
+
+using namespace curve_config;
+
+#include "gpu-utils/device_context.cuh"
 #include "utils/utils.h"
-
-using namespace curve_config;
-
-extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
-{
-  return (*point1 == *point2) &&
-         !((point1->x == point_field_t::zero()) && (point1->y == point_field_t::zero()) &&
-           (point1->z == point_field_t::zero())) &&
-         !((point2->x == point_field_t::zero()) && (point2->y == point_field_t::zero()) &&
-           (point2->z == point_field_t::zero()));
-}
-
-extern "C" void CONCAT_EXPAND(CURVE, ToAffine)(projective_t* point, affine_t* point_out)
-{
-  *point_out = projective_t::to_affine(*point);
-}
-
-extern "C" void CONCAT_EXPAND(CURVE, GenerateProjectivePoints)(projective_t* points, int size)
-{
-  projective_t::RandHostMany(points, size);
-}
-
-extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int size)
-{
-  projective_t::RandHostManyAffine(points, size);
-}
-
-#if defined(G2_DEFINED)
-
-using namespace curve_config;
+#include "utils/mont.cuh"

 extern "C" bool CONCAT_EXPAND(CURVE, G2Eq)(g2_projective_t* point1, g2_projective_t* point2)
 {
@@ -57,4 +30,22 @@ extern "C" void CONCAT_EXPAND(CURVE, G2GenerateAffinePoints)(g2_affine_t* points
  g2_projective_t::RandHostManyAffine(points, size);
 }

-#endif
+extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2AffineConvertMontgomery)(
+  g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+{
+  if (is_into) {
+    return mont::ToMontgomery(d_inout, n, ctx.stream, d_inout);
+  } else {
+    return mont::FromMontgomery(d_inout, n, ctx.stream, d_inout);
+  }
+}
+
+extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2ProjectiveConvertMontgomery)(
+  g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+{
+  if (is_into) {
+    return mont::ToMontgomery(d_inout, n, ctx.stream, d_inout);
+  } else {
+    return mont::FromMontgomery(d_inout, n, ctx.stream, d_inout);
+  }
+}
--- a/icicle/src/fields/CMakeLists.txt
+++ b/icicle/src/fields/CMakeLists.txt
@@ -0,0 +1,39 @@
+if (EXT_FIELD)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
+endif ()
+
+SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin)
+
+set(TARGET icicle_field)
+
+set(SRC ${CMAKE_SOURCE_DIR}/src)
+
+set(FIELD_SOURCE ${SRC}/fields/extern.cu)
+list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
+if(EXT_FIELD)
+  list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu)
+  list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
+  list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
+endif()
+
+set(POLYNOMIAL_SOURCE_FILES 
+    ${SRC}/polynomials/polynomials.cu
+    ${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu)
+
+list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
+
+# TODO: impl poseidon for small fields. note that it needs to be defined for the extension field!
+if (DEFINED CURVE)
+  list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
+  list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
+endif()
+
+if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
+  list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
+  list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
+endif()
+
+add_library(${TARGET} STATIC ${FIELD_SOURCE})
+target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
+target_compile_definitions(${TARGET} PUBLIC FIELD=${FIELD})
--- a/icicle/src/fields/extern.cu
+++ b/icicle/src/fields/extern.cu
@@ -0,0 +1,22 @@
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+
+#include "utils/mont.cuh"
+#include "utils/utils.h"
+#include "gpu-utils/device_context.cuh"
+
+extern "C" void CONCAT_EXPAND(FIELD, GenerateScalars)(scalar_t* scalars, int size)
+{
+  scalar_t::RandHostMany(scalars, size);
+}
+
+extern "C" cudaError_t CONCAT_EXPAND(FIELD, ScalarConvertMontgomery)(
+  scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+{
+  if (is_into) {
+    return mont::ToMontgomery(d_inout, n, ctx.stream, d_inout);
+  } else {
+    return mont::FromMontgomery(d_inout, n, ctx.stream, d_inout);
+  }
+}
--- a/icicle/src/fields/extern_extension.cu
+++ b/icicle/src/fields/extern_extension.cu
@@ -0,0 +1,22 @@
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+
+#include "utils/mont.cuh"
+#include "utils/utils.h"
+#include "gpu-utils/device_context.cuh"
+
+extern "C" void CONCAT_EXPAND(FIELD, ExtensionGenerateScalars)(extension_t* scalars, int size)
+{
+  extension_t::RandHostMany(scalars, size);
+}
+
+extern "C" cudaError_t CONCAT_EXPAND(FIELD, ExtensionScalarConvertMontgomery)(
+  extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+{
+  if (is_into) {
+    return mont::ToMontgomery(d_inout, n, ctx.stream, d_inout);
+  } else {
+    return mont::FromMontgomery(d_inout, n, ctx.stream, d_inout);
+  }
+}
--- a/icicle/src/hash/CMakeLists.txt
+++ b/icicle/src/hash/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET icicle_hash)
+
+add_library(${TARGET} STATIC keccak/keccak.cu)
+target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
--- a/icicle/src/hash/keccak/Makefile
+++ b/icicle/src/hash/keccak/Makefile
--- a/icicle/src/hash/keccak/keccak.cu
+++ b/icicle/src/hash/keccak/keccak.cu
@@ -1,4 +1,4 @@
-#include "keccak.cuh"
+#include "hash/keccak/keccak.cuh"

 namespace keccak {
 #define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
--- a/icicle/src/hash/keccak/test.cu
+++ b/icicle/src/hash/keccak/test.cu
--- a/icicle/src/hash/keccak/test_keccak
+++ b/icicle/src/hash/keccak/test_keccak
--- a/icicle/appUtils/msm/Makefile
+++ b/icicle/appUtils/msm/Makefile
--- a/icicle/src/msm/extern.cu
+++ b/icicle/src/msm/extern.cu
@@ -0,0 +1,43 @@
+#include "curves/curve_config.cuh"
+#include "fields/field_config.cuh"
+
+using namespace curve_config;
+using namespace field_config;
+
+#include "msm.cu"
+#include "utils/utils.h"
+
+namespace msm {
+  /**
+   * Extern "C" version of [PrecomputeMSMBases](@ref PrecomputeMSMBases) function with the following values of template
+   * parameters (where the curve is given by `-DCURVE` env variable during build):
+   *  - `A` is the [affine representation](@ref affine_t) of curve points;
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, PrecomputeMSMBases)(
+    affine_t* bases,
+    int bases_size,
+    int precompute_factor,
+    int _c,
+    bool are_bases_on_device,
+    device_context::DeviceContext& ctx,
+    affine_t* output_bases)
+  {
+    return PrecomputeMSMBases<affine_t, projective_t>(
+      bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
+  }
+
+  /**
+   * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
+   * (where the curve is given by `-DCURVE` env variable during build):
+   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
+   *  - `A` is the [affine representation](@ref affine_t) of curve points;
+   *  - `P` is the [projective representation](@ref projective_t) of curve points.
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, MSMCuda)(
+    const scalar_t* scalars, const affine_t* points, int msm_size, MSMConfig& config, projective_t* out)
+  {
+    return MSM<scalar_t, affine_t, projective_t>(scalars, points, msm_size, config, out);
+  }
+} // namespace msm
--- a/icicle/src/msm/extern_g2.cu
+++ b/icicle/src/msm/extern_g2.cu
@@ -0,0 +1,43 @@
+#include "curves/curve_config.cuh"
+#include "fields/field_config.cuh"
+
+using namespace curve_config;
+using namespace field_config;
+
+#include "msm.cu"
+#include "utils/utils.h"
+
+namespace msm {
+  /**
+   * Extern "C" version of [PrecomputeMSMBases](@ref PrecomputeMSMBases) function with the following values of template
+   * parameters (where the curve is given by `-DCURVE` env variable during build):
+   *  - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2PrecomputeMSMBases)(
+    g2_affine_t* bases,
+    int bases_size,
+    int precompute_factor,
+    int _c,
+    bool are_bases_on_device,
+    device_context::DeviceContext& ctx,
+    g2_affine_t* output_bases)
+  {
+    return PrecomputeMSMBases<g2_affine_t, g2_projective_t>(
+      bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
+  }
+
+  /**
+   * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
+   * (where the curve is given by `-DCURVE` env variable during build):
+   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
+   *  - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
+   *  - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points.
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2MSMCuda)(
+    const scalar_t* scalars, const g2_affine_t* points, int msm_size, MSMConfig& config, g2_projective_t* out)
+  {
+    return MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars, points, msm_size, config, out);
+  }
+} // namespace msm
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
@@ -1,4 +1,4 @@
-#include "msm.cuh"
+#include "msm/msm.cuh"

 #include <cooperative_groups.h>
 #include <cub/device/device_radix_sort.cuh>
@@ -10,13 +10,11 @@
 #include <stdexcept>
 #include <vector>

-#include "curves/curve_config.cuh"
-#include "primitives/affine.cuh"
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#include "utils/error_handler.cuh"
+#include "curves/affine.cuh"
+#include "curves/projective.cuh"
+#include "fields/field.cuh"
+#include "gpu-utils/error_handler.cuh"
 #include "utils/mont.cuh"
-#include "utils/utils.h"

 namespace msm {

@@ -831,9 +829,8 @@ namespace msm {
  } // namespace

  template <typename A>
-  MSMConfig DefaultMSMConfig()
+  MSMConfig DefaultMSMConfig(const device_context::DeviceContext& ctx)
  {
-    device_context::DeviceContext ctx = device_context::get_default_device_context();
    MSMConfig config = {
      ctx,   // ctx
      0,     // points_size
@@ -853,6 +850,9 @@ namespace msm {
    return config;
  }

+  // explicit instantiation to avoid having to include this file
+  template MSMConfig DefaultMSMConfig<scalar_t>(const device_context::DeviceContext& ctx);
+
  template <typename S, typename A, typename P>
  cudaError_t MSM(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results)
  {
@@ -906,85 +906,4 @@ namespace msm {

    return CHK_LAST();
  }
-
-  /**
-   * Extern "C" version of [PrecomputeMSMBases](@ref PrecomputeMSMBases) function with the following values of
-   * template parameters (where the curve is given by `-DCURVE` env variable during build):
-   *  - `A` is the [affine representation](@ref affine_t) of curve points;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, PrecomputeMSMBases)(
-    curve_config::affine_t* bases,
-    int bases_size,
-    int precompute_factor,
-    int _c,
-    bool are_bases_on_device,
-    device_context::DeviceContext& ctx,
-    curve_config::affine_t* output_bases)
-  {
-    return PrecomputeMSMBases<curve_config::affine_t, curve_config::projective_t>(
-      bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
-  }
-
-  /**
-   * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
-   *  - `A` is the [affine representation](@ref affine_t) of curve points;
-   *  - `P` is the [projective representation](@ref projective_t) of curve points.
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, MSMCuda)(
-    const curve_config::scalar_t* scalars,
-    const curve_config::affine_t* points,
-    int msm_size,
-    MSMConfig& config,
-    curve_config::projective_t* out)
-  {
-    return MSM<curve_config::scalar_t, curve_config::affine_t, curve_config::projective_t>(
-      scalars, points, msm_size, config, out);
-  }
-
-#if defined(G2_DEFINED)
-
-  /**
-   * Extern "C" version of [PrecomputeMSMBases](@ref PrecomputeMSMBases) function with the following values of
-   * template parameters (where the curve is given by `-DCURVE` env variable during build):
-   *  - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2PrecomputeMSMBases)(
-    curve_config::g2_affine_t* bases,
-    int bases_size,
-    int precompute_factor,
-    int _c,
-    bool are_bases_on_device,
-    device_context::DeviceContext& ctx,
-    curve_config::g2_affine_t* output_bases)
-  {
-    return PrecomputeMSMBases<curve_config::g2_affine_t, curve_config::g2_projective_t>(
-      bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
-  }
-
-  /**
-   * Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
-   *  - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
-   *  - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points.
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2MSMCuda)(
-    const curve_config::scalar_t* scalars,
-    const curve_config::g2_affine_t* points,
-    int msm_size,
-    MSMConfig& config,
-    curve_config::g2_projective_t* out)
-  {
-    return MSM<curve_config::scalar_t, curve_config::g2_affine_t, curve_config::g2_projective_t>(
-      scalars, points, msm_size, config, out);
-  }
-
-#endif
-
-} // namespace msm
+} // namespace msm
--- a/icicle/appUtils/msm/tests/msm_test.cu
+++ b/icicle/appUtils/msm/tests/msm_test.cu
@@ -6,10 +6,12 @@
 #include <iostream>
 #include <vector>

-#include "curves/curve_config.cuh"
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#include "utils/device_context.cuh"
+#include "curves/bn254/bn254.cuh"
+#include "fields/field.cuh"
+#include "curves/projective.cuh"
+#include "gpu-utils/device_context.cuh"
+
+using namespace bn254;

 class Dummy_Scalar
 {
@@ -111,9 +113,9 @@ public:

 // switch between dummy and real:

-typedef curve_config::scalar_t test_scalar;
-typedef curve_config::projective_t test_projective;
-typedef curve_config::affine_t test_affine;
+typedef scalar_t test_scalar;
+typedef projective_t test_projective;
+typedef affine_t test_affine;

 // typedef Dummy_Scalar test_scalar;
 // typedef Dummy_Projective test_projective;
@@ -136,7 +138,7 @@ int main()

  // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
  // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
-  test_projective large_res[batch_size];
+  test_projective large_res[2];
  // test_projective batched_large_res[batch_size];
  // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
  // fake_point batched_large_res[256];
@@ -195,7 +197,9 @@ int main()
  printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
  config.is_big_triangle = true;
  config.are_results_on_device = false;
-  std::cout << test_projective::to_affine(large_res[0]) << std::endl;
+  cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
+  std::cout << test_projective::to_affine(large_res[1]) << " " << test_projective::is_on_curve(large_res[1])
+            << std::endl;
  auto begin = std::chrono::high_resolution_clock::now();
  msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
  // test_reduce_triangle(scalars);
@@ -208,10 +212,6 @@ int main()
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);

-  std::cout << test_projective::to_affine(large_res[0]) << std::endl;
-
-  cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
-
  //   reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);

  // std::cout<<"final results batched large"<<std::endl;
--- a/icicle/appUtils/ntt/Makefile
+++ b/icicle/appUtils/ntt/Makefile
@@ -1,13 +1,13 @@
 build_verification:
 	mkdir -p work
-	nvcc -o work/test_verification -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17
+	nvcc -o work/test_verification -I. -I../../include tests/verification.cu -std=c++17

 test_verification: build_verification
 	work/test_verification

 build_verification_ecntt:
 	mkdir -p work
-	nvcc -o work/test_verification_ecntt -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17 -DECNTT_DEFINED
+	nvcc -o work/test_verification_ecntt -I. -I../../include tests/verification.cu -std=c++17 -DECNTT

 test_verification_ecntt: build_verification_ecntt
 	work/test_verification_ecntt
--- a/icicle/src/ntt/extern.cu
+++ b/icicle/src/ntt/extern.cu
@@ -0,0 +1,47 @@
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+
+#include "ntt.cu"
+
+#include "gpu-utils/device_context.cuh"
+#include "utils/utils.h"
+
+namespace ntt {
+  /**
+   * Extern "C" version of [InitDomain](@ref InitDomain) function with the following
+   * value of template parameter (where the field is given by `-DFIELD` env variable during build):
+   *  - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
+   * stand-alone "STARK field";
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, InitializeDomain)(
+    scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
+  {
+    return InitDomain(*primitive_root, ctx, fast_twiddles_mode);
+  }
+
+  /**
+   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
+   * (where the field is given by `-DFIELD` env variable during build):
+   *  - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
+   * stand-alone "STARK field";
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, NTTCuda)(
+    const scalar_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, scalar_t* output)
+  {
+    return NTT<scalar_t, scalar_t>(input, size, dir, config, output);
+  }
+
+  /**
+   * Extern "C" version of [ReleaseDomain](@ref ReleaseDomain) function with the following values of template parameters
+   * (where the field is given by `-DFIELD` env variable during build):
+   *  - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
+   * stand-alone "STARK field";
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, ReleaseDomain)(device_context::DeviceContext& ctx)
+  {
+    return ReleaseDomain<scalar_t>(ctx);
+  }
+} // namespace ntt
--- a/icicle/src/ntt/extern_ecntt.cu
+++ b/icicle/src/ntt/extern_ecntt.cu
@@ -0,0 +1,25 @@
+#include "curves/curve_config.cuh"
+#include "fields/field_config.cuh"
+
+using namespace curve_config;
+using namespace field_config;
+
+#include "ntt.cu"
+
+#include "gpu-utils/device_context.cuh"
+#include "utils/utils.h"
+
+namespace ntt {
+  /**
+   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
+   * (where the curve is given by `-DCURVE` env variable during build):
+   *  - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
+   *  - `E` is the [scalar field](@ref scalar_t) of the curve;
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, ECNTTCuda)(
+    const projective_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, projective_t* output)
+  {
+    return NTT<scalar_t, projective_t>(input, size, dir, config, output);
+  }
+} // namespace ntt
--- a/icicle/src/ntt/extern_extension.cu
+++ b/icicle/src/ntt/extern_extension.cu
@@ -0,0 +1,23 @@
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+
+#include "ntt.cu"
+
+#include "gpu-utils/device_context.cuh"
+#include "utils/utils.h"
+
+namespace ntt {
+  /**
+   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
+   * (where the field is given by `-DFIELD` env variable during build):
+   *  - `E` is the [field](@ref scalar_t);
+   *  - `S` is the [extension](@ref extension_t) of `E` of appropriate degree;
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, ExtensionNTTCuda)(
+    const extension_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, extension_t* output)
+  {
+    return NTT<scalar_t, extension_t>(input, size, dir, config, output);
+  }
+} // namespace ntt
--- a/icicle/appUtils/ntt/kernel_ntt.cu
+++ b/icicle/appUtils/ntt/kernel_ntt.cu
@@ -1,8 +1,10 @@
+#include "fields/field_config.cuh"

-#include "appUtils/ntt/thread_ntt.cu"
-#include "curves/curve_config.cuh"
-#include "utils/sharedmem.cuh"
-#include "appUtils/ntt/ntt.cuh" // for ntt::Ordering
+using namespace field_config;
+
+#include "thread_ntt.cu"
+#include "gpu-utils/sharedmem.cuh"
+#include "ntt/ntt.cuh" // for ntt::Ordering

 namespace mxntt {

@@ -998,27 +1000,27 @@ namespace mxntt {

  // Explicit instantiation for scalar type
  template cudaError_t generate_external_twiddles_generic(
-    const curve_config::scalar_t& basic_root,
-    curve_config::scalar_t* external_twiddles,
-    curve_config::scalar_t*& internal_twiddles,
-    curve_config::scalar_t*& basic_twiddles,
+    const scalar_t& basic_root,
+    scalar_t* external_twiddles,
+    scalar_t*& internal_twiddles,
+    scalar_t*& basic_twiddles,
    uint32_t log_size,
    cudaStream_t& stream);

  template cudaError_t generate_external_twiddles_fast_twiddles_mode(
-    const curve_config::scalar_t& basic_root,
-    curve_config::scalar_t* external_twiddles,
-    curve_config::scalar_t*& internal_twiddles,
-    curve_config::scalar_t*& basic_twiddles,
+    const scalar_t& basic_root,
+    scalar_t* external_twiddles,
+    scalar_t*& internal_twiddles,
+    scalar_t*& basic_twiddles,
    uint32_t log_size,
    cudaStream_t& stream);

-  template cudaError_t mixed_radix_ntt<curve_config::scalar_t, curve_config::scalar_t>(
-    const curve_config::scalar_t* d_input,
-    curve_config::scalar_t* d_output,
-    curve_config::scalar_t* external_twiddles,
-    curve_config::scalar_t* internal_twiddles,
-    curve_config::scalar_t* basic_twiddles,
+  template cudaError_t mixed_radix_ntt<scalar_t, scalar_t>(
+    const scalar_t* d_input,
+    scalar_t* d_output,
+    scalar_t* external_twiddles,
+    scalar_t* internal_twiddles,
+    scalar_t* basic_twiddles,
    int ntt_size,
    int max_logn,
    int batch_size,
@@ -1026,17 +1028,37 @@ namespace mxntt {
    bool is_inverse,
    bool fast_tw,
    ntt::Ordering ordering,
-    curve_config::scalar_t* arbitrary_coset,
+    scalar_t* arbitrary_coset,
    int coset_gen_index,
    cudaStream_t cuda_stream);
+
+#if defined(EXT_FIELD)
+  template cudaError_t mixed_radix_ntt<extension_t, scalar_t>(
+    const extension_t* d_input,
+    extension_t* d_output,
+    scalar_t* external_twiddles,
+    scalar_t* internal_twiddles,
+    scalar_t* basic_twiddles,
+    int ntt_size,
+    int max_logn,
+    int batch_size,
+    bool columns_batch,
+    bool is_inverse,
+    bool fast_tw,
+    ntt::Ordering ordering,
+    scalar_t* arbitrary_coset,
+    int coset_gen_index,
+    cudaStream_t cuda_stream);
+#endif
+
  // TODO: we may reintroduce mixed-radix ECNTT based on upcoming benching PR
-  // #if defined(ECNTT_DEFINED)
-  //   template cudaError_t mixed_radix_ntt<curve_config::projective_t, curve_config::scalar_t>(
-  //     curve_config::projective_t* d_input,
-  //     curve_config::projective_t* d_output,
-  //     curve_config::scalar_t* external_twiddles,
-  //     curve_config::scalar_t* internal_twiddles,
-  //     curve_config::scalar_t* basic_twiddles,
+  // #if defined(ECNTT)
+  //   template cudaError_t mixed_radix_ntt<projective_t, scalar_t>(
+  //     projective_t* d_input,
+  //     projective_t* d_output,
+  //     scalar_t* external_twiddles,
+  //     scalar_t* internal_twiddles,
+  //     scalar_t* basic_twiddles,
  //     int ntt_size,
  //     int max_logn,
  //     int batch_size,
@@ -1044,8 +1066,8 @@ namespace mxntt {
  //     bool is_inverse,
  //     bool fast_tw,
  //     ntt::Ordering ordering,
-  //     curve_config::scalar_t* arbitrary_coset,
+  //     scalar_t* arbitrary_coset,
  //     int coset_gen_index,
  //     cudaStream_t cuda_stream);
-  // #endif // ECNTT_DEFINED
+  // #endif // ECNTT
 } // namespace mxntt
--- a/icicle/appUtils/ntt/ntt.cu
+++ b/icicle/appUtils/ntt/ntt.cu
@@ -1,19 +1,27 @@
-#include "ntt.cuh"
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+
+#include "ntt/ntt.cuh"

 #include <unordered_map>
 #include <vector>
 #include <type_traits>

-#include "curves/curve_config.cuh"
-#include "utils/sharedmem.cuh"
+#include "gpu-utils/sharedmem.cuh"
 #include "utils/utils_kernels.cuh"
 #include "utils/utils.h"
-#include "appUtils/ntt/ntt_impl.cuh"
-#include "appUtils/ntt/ntt.cuh" // for ntt::Ordering
+#include "ntt/ntt_impl.cuh"

 #include <mutex>

-#define IS_ECNTT std::is_same_v<E, curve_config::projective_t>
+#ifdef CURVE_ID
+#include "curves/curve_config.cuh"
+using namespace curve_config;
+#define IS_ECNTT std::is_same_v<E, projective_t>
+#else
+#define IS_ECNTT false
+#endif

 namespace ntt {

@@ -404,6 +412,9 @@ namespace ntt {
    template <typename U>
    friend cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);

+    template <typename U>
+    friend U GetRootOfUnity<U>(uint64_t logn, device_context::DeviceContext& ctx);
+
    template <typename U, typename E>
    friend cudaError_t NTT<U, E>(const E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
  };
@@ -528,6 +539,22 @@ namespace ntt {
    return CHK_LAST();
  }

+  template <typename S>
+  S GetRootOfUnity(uint64_t logn, device_context::DeviceContext& ctx)
+  {
+    Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
+    if (logn > domain.max_log_size) {
+      std::ostringstream oss;
+      oss << "NTT log_size=" << logn
+          << " is too large for the domain. Consider generating your domain with a higher order root of unity.\n";
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
+    }
+    const size_t twiddles_idx = 1ULL << (domain.max_log_size - logn);
+    return domain.twiddles[twiddles_idx];
+  }
+  // explicit instantiation to avoid having to include this file
+  template scalar_t GetRootOfUnity(uint64_t logn, device_context::DeviceContext& ctx);
+
  template <typename S>
  static bool is_choosing_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
  {
@@ -588,7 +615,6 @@ namespace ntt {
      break;
    case Ordering::kRN:
    case Ordering::kMN:
-      dit = true;
      reverse_input = false;
    }

@@ -706,9 +732,8 @@ namespace ntt {
  }

  template <typename S>
-  NTTConfig<S> DefaultNTTConfig()
+  NTTConfig<S> DefaultNTTConfig(const device_context::DeviceContext& ctx)
  {
-    device_context::DeviceContext ctx = device_context::get_default_device_context();
    NTTConfig<S> config = {
      ctx,                // ctx
      S::one(),           // coset_gen
@@ -722,63 +747,6 @@ namespace ntt {
    };
    return config;
  }
-
-  /**
-   * Extern "C" version of [InitDomain](@ref InitDomain) function with the following
-   * value of template parameter (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, InitializeDomain)(
-    curve_config::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
-  {
-    return InitDomain(*primitive_root, ctx, fast_twiddles_mode);
-  }
-
-  /**
-   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
-    const curve_config::scalar_t* input,
-    int size,
-    NTTDir dir,
-    NTTConfig<curve_config::scalar_t>& config,
-    curve_config::scalar_t* output)
-  {
-    return NTT<curve_config::scalar_t, curve_config::scalar_t>(input, size, dir, config, output);
-  }
-
-  /**
-   * Extern "C" version of [ReleaseDomain](@ref ReleaseDomain) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, ReleaseDomain)(device_context::DeviceContext& ctx)
-  {
-    return ReleaseDomain<curve_config::scalar_t>(ctx);
-  }
-
-#if defined(ECNTT_DEFINED)
-  /**
-   * Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
-   * (where the curve is given by `-DCURVE` env variable during build):
-   *  - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
-   *  - `E` is the [scalar field](@ref scalar_t) of the curve;
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(CURVE, ECNTTCuda)(
-    const curve_config::projective_t* input,
-    int size,
-    NTTDir dir,
-    NTTConfig<curve_config::scalar_t>& config,
-    curve_config::projective_t* output)
-  {
-    return NTT<curve_config::scalar_t, curve_config::projective_t>(input, size, dir, config, output);
-  }
-
-#endif
-
+  // explicit instantiation to avoid having to include this file
+  template NTTConfig<scalar_t> DefaultNTTConfig(const device_context::DeviceContext& ctx);
 } // namespace ntt
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Soowon Jeong	e02de2f485	Specify curve as bn254	2024-08-13 17:59:48 +09:00
Ido Atlas	7a32a88d6d	integration with V2	2024-04-17 11:19:38 +03:00
Ido Atlas	4cc5d2d71c	Merge branch 'V2' into sumcheck-experiments	2024-04-15 17:43:10 +03:00
yshekel	6e3eecd8ca	Polynomial API and CUDA backend (1) C++ template API that is backend (GPU, ZPU or other) agnostic (polynomials.h) (2) concrete CUDA implementation (polynomial_cuda_backend.cu/cuh) (3) C API for FFI (4) Groth16 example (test) (5) icicle library is now built for tests as well. Polynomial tests are linked to icicle lib.	2024-04-14 16:31:49 +03:00
DmytroTym	0c340924eb	Baby bear Rust wrappers (#472 ) This PR introduces Rust wrappers for the baby bear finite field, including NTTs and vector operations (for both native and extension fields).	2024-04-14 18:23:43 +07:00
Ido Atlas	9d461be0bd	small fix	2024-04-14 11:58:24 +03:00
DmytroTym	c123940abb	Tests and benches for extension fields (#470 ) - C++ side tests and benchmarks refactored using templates; - Extension field tests and benchmarks added; - README added for benches; - Some small additions to extension field API for convenience;	2024-04-13 20:07:30 +07:00
nonam3e	af4ab88f3a	Feat/golang/v2 bindings (#468 ) ## Describe the changes This PR updates current golang bindings to icicle v2	2024-04-11 22:44:53 +07:00
nonam3e	1ae1fa892c	build templates	2024-04-11 14:25:57 +00:00
nonam3e	bf89f96dff	Update wrappers/golang/internal/generator/templates/ecntt_test.go.tmpl Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>	2024-04-11 19:44:03 +07:00
nonam3e	ebf9fc4740	fmt template	2024-04-11 11:31:17 +00:00
nonam3e	b56187452f	fmt	2024-04-11 11:17:28 +00:00
nonam3e	ba41d3e97b	merge uint64	2024-04-11 11:08:39 +00:00
nonam3e	7fe2a11941	fmt	2024-04-11 10:03:48 +00:00
nonam3e	5144c15845	fmt	2024-04-11 10:02:49 +00:00
nonam3e	013f1efa09	review fixes	2024-04-11 09:52:42 +00:00
Jeremy Felder	b2b3702b20	[BREAKING] Change golang field representation to uint64 (#400 ) ## Describe the changes This PR changes the field representation in Golang bindings to use uint64 limbs instead of uint32 for easier conversion with other libraries	2024-04-11 11:55:43 +03:00
ChickenLover	5b8e22953b	run releasedomain last and add assertion to test (#469 ) ## Describe the changes This PR... ## Linked Issues Resolves # --------- Co-authored-by: LeonHibnik <leon@ingonyama.com>	2024-04-11 13:26:27 +07:00
nonam3e	452aea71af	update golang readme	2024-04-10 23:17:23 +00:00
nonam3e	356b13f650	update CI	2024-04-10 23:05:46 +00:00
nonam3e	2952401e40	g2 ecntt packages	2024-04-10 23:00:59 +00:00
Ido Atlas	cf9579d4f9	separate single round	2024-04-10 15:02:28 +03:00
nonam3e	a50af2f06e	ecntt build tag	2024-04-09 20:39:13 +00:00
Ido Atlas	bdf4f1ee24	separate/unified option	2024-04-09 16:29:18 +03:00
ChickenLover	9af6bb5666	Merge branch 'main' into V2	2024-04-09 18:47:07 +07:00
ChickenLover	da31adefce	v2 new design (#443 ) Co-authored-by: DmytroTym <dmytrotym1@gmail.com> Co-authored-by: Otsar <122266060+Otsar-Raikou@users.noreply.github.com> Co-authored-by: VitaliiH <vhnatyk@gmail.com> Co-authored-by: release-bot <release-bot@ingonyama.com> Co-authored-by: Stas <spolonsky@icloud.com> Co-authored-by: Yuval Shekel <yshekel@gmail.com> Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com> Co-authored-by: ImmanuelSegol <3ditds@gmail.com> Co-authored-by: JimmyHongjichuan <45908291+JimmyHongjichuan@users.noreply.github.com> Co-authored-by: pierre <pierreuu@gmail.com> Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com> Co-authored-by: nonam3e <timur@ingonyama.com> Co-authored-by: Vlad <88586482+vladfdp@users.noreply.github.com> Co-authored-by: LeonHibnik <leon@ingonyama.com>	2024-04-09 18:27:35 +07:00
Ido Atlas	9b39e771ea	double round separate kernels	2024-04-09 12:37:10 +03:00
Ido Atlas	828fc9c006	double round verified for 1 poly	2024-04-07 18:42:17 +03:00
Ido Atlas	53395312e3	add reference for double round	2024-04-07 14:42:01 +03:00
Ido Atlas	709e6a12e5	really verify 2 and 4	2024-04-03 13:27:55 +03:00
Ido Atlas	0e067b296f	adding templated version	2024-04-03 13:25:17 +03:00
Ido Atlas	b9409d2109	generalized reference	2024-04-03 12:57:44 +03:00
Ido Atlas	19fe6f6d5d	generalized verified for 1 and 3 polys	2024-04-03 12:46:35 +03:00
Ido Atlas	be4b636494	generalized	2024-04-02 13:31:20 +03:00
Ido Atlas	9a63f17e1b	ALL VERIFIED	2024-04-01 22:37:33 +03:00
Ido Atlas	425dd2c38d	fix alg1unified with new kernel	2024-04-01 22:01:02 +03:00
Ido Atlas	0c6fe543fa	new test vector	2024-04-01 20:05:26 +03:00
Ido Atlas	c5ed01b52c	alg3 simple works and verified	2024-04-01 16:32:43 +03:00
Ido Atlas	57ac6a13dc	inplace sum reduction	2024-03-31 14:29:22 +03:00
Ido Atlas	7833324d9c	suncheck 3 - first round passses	2024-03-31 12:24:04 +03:00
Ido Atlas	2111c3a91a	order change, alg1 unified works without bugs	2024-03-27 21:49:27 +02:00
Ido Atlas	009a17af11	before rewriting	2024-03-27 17:23:38 +02:00
Yuval Shekel	4b50103151	enable CI for V2 branch	2024-03-27 12:26:45 +02:00
Ido Atlas	e24f74c97a	adding test vecs	2024-03-26 15:59:38 +02:00
Ido Atlas	fa5064784f	update kernels for alg3	2024-03-24 17:42:44 +02:00
Ido Atlas	f0b86b9c06	updated kernels for product. problem with n>16 for unified	2024-03-21 16:56:09 +02:00
Ido Atlas	79719fcb0f	unified kernel works	2024-03-21 15:36:59 +02:00
hadaringonyama	792a2e5f97	updated verification env	2024-03-17 14:02:43 +02:00
hadaringonyama	f6fee8e201	sumcheck verified	2024-03-13 10:33:38 +02:00
hadaringonyama	f5492a97fb	bug fix	2024-03-12 16:28:31 +02:00
hadaringonyama	2ffda0ffac	accumulate function verified	2024-03-12 15:50:30 +02:00
hadaringonyama	2af9e44219	sumcheck design	2024-02-25 11:18:07 +02:00