Compare commits

...

8 Commits

Author SHA1 Message Date
IdoAtlas 42cffb1c88 vec ops compiles 2024-05-12 14:01:17 +03:00
IdoAtlas d3274a9eaa poseidon compiles 2024-05-12 13:43:47 +03:00
IdoAtlas d31a7019fe polynomial compiles 2024-05-12 13:28:11 +03:00
IdoAtlas 84a0d3c348 ntt compiled 2024-05-12 12:55:07 +03:00
IdoAtlas eb87970325 msm compiled 2024-05-09 11:31:22 +03:00
IdoAtlas a9081aabbf build hash 2024-05-09 10:59:44 +03:00
IdoAtlas b564c6670d curve build 2024-05-09 10:45:01 +03:00
Otsar 1f9f3f13ea cuda cpu empty mock, field compiles 2024-05-09 10:02:59 +03:00
67 changed files with 3219 additions and 1779 deletions
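Nearly every hunk below follows the same pattern: CUDA-only qualifiers (HOST_DEVICE_INLINE, HOST_INLINE, DEVICE_INLINE, __device__, __align__, UNROLL) are stripped from declarations, include paths are rewritten as relative paths, and device-only constructs (inline PTX, extern __shared__ arrays) become empty host stubs, so that every header parses as plain C++. The macro definitions in the sketch below are an assumption about gpu-utils/modifiers.cuh (that file is not part of this diff); they only illustrate why the qualifiers have to go in a CPU-only build:

// Assumed definitions, roughly what gpu-utils/modifiers.cuh would provide under nvcc:
//   #define HOST_INLINE        __host__ __forceinline__
//   #define DEVICE_INLINE      __device__ __forceinline__
//   #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
// __host__/__device__/__forceinline__ exist only under nvcc, so a plain g++/clang++
// build must either redefine these macros to nothing or, as these commits do,
// drop the qualifiers from every declaration, e.g.:
static HOST_DEVICE_INLINE Affine neg(const Affine& point);   // before (CUDA build)
static Affine neg(const Affine& point);                      // after (host-only build)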

View File

@@ -119,6 +119,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
- [PatStiles](https://github.com/PatStiles), for adding Stark252 field
## Help & Support

View File

@@ -88,7 +88,7 @@ void point_near_x(T x, affine_t *point) {
}
static int seed = 0;
static HOST_INLINE T rand_host_seed()
static T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;

View File

@@ -1,7 +1,7 @@
#pragma once
#include "gpu-utils/sharedmem.cuh"
#include "gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include <iostream>
template <class FF>
@@ -11,26 +11,26 @@ public:
FF x;
FF y;
static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static HOST_DEVICE_INLINE Affine zero() { return {FF::zero(), FF::zero()}; }
static Affine zero() { return {FF::zero(), FF::zero()}; }
static HOST_DEVICE_INLINE Affine to_montgomery(const Affine& point)
static Affine to_montgomery(const Affine& point)
{
return {FF::to_montgomery(point.x), FF::to_montgomery(point.y)};
}
static HOST_DEVICE_INLINE Affine from_montgomery(const Affine& point)
static Affine from_montgomery(const Affine& point)
{
return {FF::from_montgomery(point.x), FF::from_montgomery(point.y)};
}
friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
friend bool operator==(const Affine& xs, const Affine& ys)
{
return (xs.x == ys.x) && (xs.y == ys.y);
}
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
friend std::ostream& operator<<(std::ostream& os, const Affine& point)
{
os << "x: " << point.x << "; y: " << point.y;
return os;
@@ -39,9 +39,9 @@ public:
template <class FF>
struct SharedMemory<Affine<FF>> {
__device__ Affine<FF>* getPointer()
Affine<FF>* getPointer()
{
extern __shared__ Affine<FF> s_affine_[];
Affine<FF> *s_affine_ = nullptr;
return s_affine_;
}
};
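extern __shared__ declares dynamic shared memory, which exists only inside CUDA kernels, so the host mock above simply hands back a null pointer to keep the header compiling. A minimal sketch (my assumption, not part of this changeset) of how both behaviors could coexist behind a compiler guard:

template <class FF>
struct SharedMemory<Affine<FF>> {
#ifdef __CUDACC__
  __device__ Affine<FF>* getPointer()
  {
    extern __shared__ Affine<FF> s_affine_[]; // dynamic shared memory, device only
    return s_affine_;
  }
#else
  Affine<FF>* getPointer() { return nullptr; } // host stub, as in this diff
#endif
};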

View File

@@ -1,9 +1,9 @@
#pragma once
#pragma once
#ifndef CURVE_CONFIG_H
#define CURVE_CONFIG_H
#include "fields/id.h"
#include "curves/projective.cuh"
#include "../fields/id.h"
#include "projective.cuh"
/**
* @namespace curve_config
@@ -12,23 +12,23 @@
* with the `-DCURVE` env variable passed during build.
*/
#if CURVE_ID == BN254
#include "curves/params/bn254.cuh"
#include "params/bn254.cuh"
namespace curve_config = bn254;
#elif CURVE_ID == BLS12_381
#include "curves/params/bls12_381.cuh"
#include "params/bls12_381.cuh"
namespace curve_config = bls12_381;
#elif CURVE_ID == BLS12_377
#include "curves/params/bls12_377.cuh"
#include "params/bls12_377.cuh"
namespace curve_config = bls12_377;
#elif CURVE_ID == BW6_761
#include "curves/params/bw6_761.cuh"
#include "params/bw6_761.cuh"
namespace curve_config = bw6_761;
#elif CURVE_ID == GRUMPKIN
#include "curves/params/grumpkin.cuh"
#include "params/grumpkin.cuh"
namespace curve_config = grumpkin;
#endif
#endif
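curve_config is resolved entirely at preprocessing time by comparing CURVE_ID against the identifiers from fields/id.h. A build sketch, assuming fields/id.h maps BN254 and the other names to distinct numeric values and assuming the include directory layout (neither appears in this diff):

  g++ -std=c++17 -DCURVE_ID=BN254 -I icicle/include -c msm.cpp

With that define, the header above pulls in params/bn254.cuh and aliases namespace curve_config = bn254; field_config.cuh later in this diff selects the scalar field the same way via -DFIELD_ID.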

View File

@@ -2,13 +2,13 @@
#ifndef BN254_PARAMS_H
#define BN254_PARAMS_H
#include "fields/storage.cuh"
#include "../../fields/storage.cuh"
#include "curves/macro.h"
#include "curves/projective.cuh"
#include "fields/snark_fields/bn254_base.cuh"
#include "fields/snark_fields/bn254_scalar.cuh"
#include "fields/quadratic_extension.cuh"
#include "../macro.h"
#include "../projective.cuh"
#include "../../fields/snark_fields/bn254_base.cuh"
#include "../../fields/snark_fields/bn254_scalar.cuh"
#include "../../fields/quadratic_extension.cuh"
namespace bn254 {
// G1 and G2 generators

View File

@@ -1,7 +1,7 @@
#pragma once
#include "affine.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/sharedmem.cuh"
template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
class Projective
@@ -19,34 +19,34 @@ public:
FF y;
FF z;
static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
static Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
static HOST_DEVICE_INLINE Affine<FF> to_affine(const Projective& point)
static Affine<FF> to_affine(const Projective& point)
{
FF denom = FF::inverse(point.z);
return {point.x * denom, point.y * denom};
}
static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point)
static Projective from_affine(const Affine<FF>& point)
{
return point == Affine<FF>::zero() ? zero() : Projective{point.x, point.y, FF::one()};
}
static HOST_DEVICE_INLINE Projective to_montgomery(const Projective& point)
static Projective to_montgomery(const Projective& point)
{
return {FF::to_montgomery(point.x), FF::to_montgomery(point.y), FF::to_montgomery(point.z)};
}
static HOST_DEVICE_INLINE Projective from_montgomery(const Projective& point)
static Projective from_montgomery(const Projective& point)
{
return {FF::from_montgomery(point.x), FF::from_montgomery(point.y), FF::from_montgomery(point.z)};
}
static HOST_DEVICE_INLINE Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
static Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
static Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
static Projective dbl(const Projective& point)
{
const FF X = point.x;
const FF Y = point.y;
@@ -74,7 +74,7 @@ public:
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
friend Projective operator+(Projective p1, const Projective& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
@@ -118,9 +118,9 @@ public:
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
friend Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2)
friend Projective operator+(Projective p1, const Affine<FF>& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
@@ -163,12 +163,12 @@ public:
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine<FF>& p2)
friend Projective operator-(Projective p1, const Affine<FF>& p2)
{
return p1 + Affine<FF>::neg(p2);
}
friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point)
friend Projective operator*(SCALAR_FF scalar, const Projective& point)
{
Projective res = zero();
#ifdef __CUDA_ARCH__
@@ -181,27 +181,27 @@ public:
return res;
}
friend HOST_DEVICE_INLINE Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
friend Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2)
friend bool operator==(const Projective& p1, const Projective& p2)
{
return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z);
}
friend HOST_DEVICE_INLINE bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
friend bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point)
friend std::ostream& operator<<(std::ostream& os, const Projective& point)
{
os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }";
return os;
}
static HOST_DEVICE_INLINE bool is_zero(const Projective& point)
static bool is_zero(const Projective& point)
{
return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero();
}
static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point)
static bool is_on_curve(const Projective& point)
{
if (is_zero(point)) return true;
bool eq_holds =
@@ -210,7 +210,7 @@ public:
return point.z != FF::zero() && eq_holds;
}
static HOST_INLINE Projective rand_host()
static Projective rand_host()
{
SCALAR_FF rand_scalar = SCALAR_FF::rand_host();
return rand_scalar * generator();
@@ -231,9 +231,9 @@ public:
template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
struct SharedMemory<Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>> {
__device__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
{
extern __shared__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> s_projective_[];
Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> *s_projective_ = nullptr;
return s_projective_;
}
};
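The body of operator*(SCALAR_FF, Projective) is mostly elided by the hunk above; after res = zero() it presumably runs the standard double-and-add loop over the scalar bits. A standalone sketch of that loop (an assumption about the elided code, using only the Projective and Field API visible in this diff):

// Left-to-right double-and-add; P is a Projective instantiation, S its scalar field.
template <typename P, typename S>
P scalar_mul_sketch(S scalar, const P& point)
{
  P res = P::zero();
  for (int i = S::NBITS - 1; i >= 0; --i) {
    res = P::dbl(res);                                      // shift the partial result up one bit
    if (scalar.get_scalar_digit(i, 1)) res = res + point;   // add the base point where the bit is set
  }
  return res;
}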

View File

@@ -18,9 +18,9 @@
#pragma once
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/error_handler.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "host_math.cuh"
#include "ptx.cuh"
#include "storage.cuh"
@@ -38,11 +38,11 @@ public:
static constexpr unsigned TLC = CONFIG::limbs_count;
static constexpr unsigned NBITS = CONFIG::modulus_bit_count;
static constexpr HOST_DEVICE_INLINE Field zero() { return Field{CONFIG::zero}; }
static constexpr Field zero() { return Field{CONFIG::zero}; }
static constexpr HOST_DEVICE_INLINE Field one() { return Field{CONFIG::one}; }
static constexpr Field one() { return Field{CONFIG::one}; }
static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
static constexpr Field from(uint32_t value)
{
storage<TLC> scalar;
scalar.limbs[0] = value;
@@ -52,7 +52,7 @@ public:
return Field{scalar};
}
static HOST_INLINE Field omega(uint32_t logn)
static Field omega(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
@@ -62,7 +62,7 @@ public:
return Field{omega.storages[logn - 1]};
}
static HOST_INLINE Field omega_inv(uint32_t logn)
static Field omega_inv(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
@@ -74,7 +74,7 @@ public:
return Field{omega_inv.storages[logn - 1]};
}
static HOST_DEVICE_INLINE Field inv_log_size(uint32_t logn)
static Field inv_log_size(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
#ifndef __CUDA_ARCH__
@@ -91,7 +91,7 @@ public:
return Field{inv.storages[logn - 1]};
}
static constexpr HOST_INLINE unsigned get_omegas_count()
static constexpr unsigned get_omegas_count()
{
if constexpr (has_member_omegas_count<CONFIG>()) {
return CONFIG::omegas_count;
@@ -113,45 +113,45 @@ public:
/**
* A new addition to the config file - \f$ 2^{32 \cdot num\_limbs} - p \f$.
*/
static constexpr HOST_DEVICE_INLINE ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
static constexpr ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
/**
* A new addition to the config file - the number of times to reduce in [reduce](@ref reduce) function.
*/
static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
static constexpr unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
static constexpr unsigned slack_bits = 32 * TLC - NBITS;
struct Wide {
ff_wide_storage limbs_storage;
static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
static constexpr Field get_lower(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}
static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
static constexpr Field get_higher(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
return out;
}
static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
static constexpr Field get_higher_with_slack(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
@@ -166,7 +166,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide& xs)
static constexpr Wide sub_modulus_squared(const Wide& xs)
{
if (REDUCTION_SIZE == 0) return xs;
const ff_wide_storage modulus = get_modulus_squared<REDUCTION_SIZE>();
@@ -175,7 +175,7 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs)
static constexpr Wide neg(const Wide& xs)
{
const ff_wide_storage modulus = get_modulus_squared<MODULUS_MULTIPLE>();
Wide rs = {};
@@ -183,14 +183,14 @@ public:
return rs;
}
friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
friend Wide operator+(Wide xs, const Wide& ys)
{
Wide rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus_squared<1>(rs);
}
friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys)
friend Wide operator-(Wide xs, const Wide& ys)
{
Wide rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -203,7 +203,7 @@ public:
// return modulus multiplied by 1, 2 or 4
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_storage get_modulus()
static constexpr ff_storage get_modulus()
{
switch (MULTIPLIER) {
case 1:
@@ -218,17 +218,17 @@ public:
}
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_wide_storage modulus_wide()
static constexpr ff_wide_storage modulus_wide()
{
return CONFIG::modulus_wide;
}
// return m
static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; }
static constexpr ff_storage get_m() { return CONFIG::m; }
// return modulus^2, helpful for ab +/- cd
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_wide_storage get_modulus_squared()
static constexpr ff_wide_storage get_modulus_squared()
{
switch (MULTIPLIER) {
case 1:
@@ -243,7 +243,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr uint32_t
add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
{
r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
@@ -258,7 +258,7 @@ public:
// add or subtract limbs
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr uint32_t
add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
{
const uint32_t* x = xs.limbs;
@@ -268,7 +268,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr uint32_t
add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
@@ -278,7 +278,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
static constexpr uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
@@ -291,7 +291,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t
static constexpr uint32_t
add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
@@ -305,7 +305,7 @@ public:
}
template <bool CARRY_OUT, typename T>
static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs)
static constexpr uint32_t add_limbs(const T& xs, const T& ys, T& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
@@ -315,7 +315,7 @@ public:
}
template <bool CARRY_OUT, typename T>
static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
static constexpr uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<true, CARRY_OUT>(xs, ys, rs);
@@ -324,18 +324,18 @@ public:
#endif
}
static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
UNROLL
for (size_t i = 0; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
}
}
static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
static void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
{
UNROLL
for (size_t i = start_i; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -343,14 +343,14 @@ public:
}
template <bool CARRY_IN = false>
static DEVICE_INLINE void
static void
cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
{
if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
UNROLL
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
@@ -358,7 +358,7 @@ public:
}
template <bool EVEN_PHASE>
static DEVICE_INLINE void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (EVEN_PHASE) {
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -367,14 +367,14 @@ public:
acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
}
UNROLL
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
}
}
static DEVICE_INLINE void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (n > 1)
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -382,7 +382,7 @@ public:
acc[0] = ptx::mad_lo(a[0], bi, acc[0]);
size_t i;
UNROLL
for (i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
if (i == n - 2)
@@ -394,7 +394,7 @@ public:
}
template <bool CARRY_OUT = false, bool CARRY_IN = false>
static DEVICE_INLINE uint32_t mad_row(
static uint32_t mad_row(
uint32_t* odd,
uint32_t* even,
const uint32_t* a,
@@ -419,7 +419,7 @@ public:
}
template <bool EVEN_PHASE>
static DEVICE_INLINE void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
@@ -428,7 +428,7 @@ public:
odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
}
static DEVICE_INLINE void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
// bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
// modulus)
@@ -439,12 +439,12 @@ public:
return;
}
static DEVICE_INLINE uint32_t
static uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);
UNROLL
for (size_t i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
@@ -467,19 +467,19 @@ public:
* \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots +
* (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
*/
static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
if constexpr (TLC > 1) {
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* even = rs.limbs;
__align__(16) uint32_t odd[2 * TLC - 2];
uint32_t odd[2 * TLC - 2];
even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
size_t i;
UNROLL
for (i = 2; i < TLC - 1; i += 2) {
mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
@@ -504,7 +504,7 @@ public:
* is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
* limb products are included.
*/
static DEVICE_INLINE void
static void
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
{
ff_storage bs = get_neg_modulus();
@@ -514,7 +514,7 @@ public:
uint32_t* even = rs.limbs;
if constexpr (TLC > 2) {
__align__(16) uint32_t odd[TLC - 1];
uint32_t odd[TLC - 1];
size_t i;
// `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
// are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
@@ -528,7 +528,6 @@ public:
mul_n(odd, a + 1, b[0], TLC - 1);
}
mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
UNROLL
for (i = 2; i < TLC - 1; i += 2) {
mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
@@ -558,15 +557,15 @@ public:
* that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the
* result inside TLC limbs and ignore the carries from the highest limb.
*/
static DEVICE_INLINE void
static void
multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
{
__align__(16) uint32_t odd[TLC - 2];
uint32_t odd[TLC - 2];
uint32_t first_row_carry = mul_n_and_add(even, a, b[0], in1);
uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);
size_t i;
UNROLL
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
carry = mad_row<true, false>(
&even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
@@ -587,15 +586,15 @@ public:
* This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
* long. The usual schoolbook algorithm is used.
*/
static DEVICE_INLINE void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
static void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
{
__align__(16) uint32_t odd[TLC - 2];
uint32_t odd[TLC - 2];
mul_n(even, a, b[0], TLC >> 1);
mul_n(odd, a + 1, b[0], TLC >> 1);
mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);
size_t i;
UNROLL
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
@@ -614,7 +613,7 @@ public:
* with so far. This method implements [subtractive
* Karatsuba](https://en.wikipedia.org/wiki/Karatsuba_algorithm#Implementation).
*/
static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
@@ -624,8 +623,8 @@ public:
// write the results into `r`.
multiply_short_raw_device(a, b, r);
multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
__align__(16) uint32_t middle_part[TLC];
__align__(16) uint32_t diffs[TLC];
uint32_t middle_part[TLC];
uint32_t diffs[TLC];
// Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to
// `carry1` and `carry2`.
uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
@@ -644,7 +643,7 @@ public:
for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
r[i] = ptx::addc_cc(r[i], 0);
} else if (TLC == 2) {
__align__(8) uint32_t odd[2];
uint32_t odd[2];
r[0] = ptx::mul_lo(a[0], b[0]);
r[1] = ptx::mul_hi(a[0], b[0]);
r[2] = ptx::mul_lo(a[1], b[1]);
@@ -662,7 +661,7 @@ public:
}
}
static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
@@ -675,7 +674,7 @@ public:
}
}
static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_raw_device(as, bs, rs);
@@ -684,7 +683,7 @@ public:
#endif
}
static HOST_DEVICE_INLINE void
static void
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
{
#ifdef __CUDA_ARCH__
@@ -697,7 +696,7 @@ public:
#endif
}
static HOST_DEVICE_INLINE void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_msb_raw_device(as, bs, rs);
@@ -709,9 +708,9 @@ public:
public:
ff_storage limbs_storage;
HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
{
const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
const uint32_t shift_bits = (digit_num * digit_width) % 32;
@@ -723,7 +722,7 @@ public:
return rv;
}
static HOST_INLINE Field rand_host()
static Field rand_host()
{
std::random_device rd;
std::mt19937_64 generator(rd());
@@ -743,7 +742,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field& xs)
static constexpr Field sub_modulus(const Field& xs)
{
if (REDUCTION_SIZE == 0) return xs;
const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
@@ -764,14 +763,14 @@ public:
return os;
}
friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
friend Field operator+(Field xs, const Field& ys)
{
Field rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus<1>(rs);
}
friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
friend Field operator-(Field xs, const Field& ys)
{
Field rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -782,7 +781,7 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide mul_wide(const Field& xs, const Field& ys)
static constexpr Wide mul_wide(const Field& xs, const Field& ys)
{
Wide rs = {};
multiply_raw(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -811,7 +810,7 @@ public:
* will cause only 1 reduction to be performed.
*/
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
static constexpr Field reduce(const Wide& xs)
{
// `xs` is left-shifted by `2 * slack_bits` and higher half is written to `xs_hi`
Field xs_hi = Wide::get_higher_with_slack(xs);
@@ -836,19 +835,19 @@ public:
return r;
}
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
friend Field operator*(const Field& xs, const Field& ys)
{
Wide xy = mul_wide(xs, ys); // full mult
return reduce(xy); // reduce mod p
}
friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
friend bool operator==(const Field& xs, const Field& ys)
{
#ifdef __CUDA_ARCH__
const uint32_t* x = xs.limbs_storage.limbs;
const uint32_t* y = ys.limbs_storage.limbs;
uint32_t limbs_or = x[0] ^ y[0];
UNROLL
for (unsigned i = 1; i < TLC; i++)
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
@@ -859,15 +858,15 @@ public:
#endif
}
friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
friend bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
template <const Field& multiplier>
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
static Field mul_const(const Field& xs)
{
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);
@@ -877,13 +876,13 @@ public:
}
template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
static constexpr T mul_unsigned(const T& xs)
{
T rs = {};
T temp = xs;
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
@@ -897,28 +896,28 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const Field& xs)
static constexpr Wide sqr_wide(const Field& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field sqr(const Field& xs)
static constexpr Field sqr(const Field& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
static constexpr Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs)
static constexpr Field from_montgomery(const Field& xs)
{
return xs * Field{CONFIG::montgomery_r_inv};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
static constexpr Field neg(const Field& xs)
{
const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
Field rs = {};
@@ -928,14 +927,14 @@ public:
// Assumes the number is even!
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
static constexpr Field div2(const Field& xs)
{
const uint32_t* x = xs.limbs_storage.limbs;
Field rs = {};
uint32_t* r = rs.limbs_storage.limbs;
if constexpr (TLC > 1) {
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC - 1; i++) {
#ifdef __CUDA_ARCH__
@@ -949,18 +948,18 @@ public:
return sub_modulus<MODULUS_MULTIPLE>(rs);
}
static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
static constexpr bool lt(const Field& xs, const Field& ys)
{
ff_storage dummy = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, dummy);
return carry;
}
static constexpr HOST_DEVICE_INLINE bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
static constexpr bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
static constexpr bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
static constexpr Field inverse(const Field& xs)
{
if (xs == zero()) return zero();
constexpr Field one = Field{CONFIG::one};
@@ -1007,9 +1006,9 @@ struct std::hash<Field<CONFIG>> {
template <class CONFIG>
struct SharedMemory<Field<CONFIG>> {
__device__ Field<CONFIG>* getPointer()
Field<CONFIG>* getPointer()
{
extern __shared__ Field<CONFIG> s_scalar_[];
Field<CONFIG>* s_scalar_ = nullptr;
return s_scalar_;
}
};
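get_scalar_digit above slices a digit_width-bit window out of the little-endian limb array, which is how MSM code typically walks a scalar. A standalone sketch of the same indexing (illustrative only; the helper name and bounds handling are mine, not from this file):

#include <cstdint>
// Extract the digit_num-th digit of digit_width bits from little-endian 32-bit limbs.
uint32_t get_digit_sketch(const uint32_t* limbs, unsigned num_limbs,
                          unsigned digit_num, unsigned digit_width)
{
  const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
  const uint32_t shift_bits = (digit_num * digit_width) % 32;
  uint64_t window = limbs[limb_lsb_idx];
  if (limb_lsb_idx + 1 < num_limbs)
    window |= (uint64_t)limbs[limb_lsb_idx + 1] << 32;      // the digit may straddle two limbs
  return (uint32_t)((window >> shift_bits) & (((uint64_t)1 << digit_width) - 1));
}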

View File

@@ -2,8 +2,8 @@
#ifndef FIELD_CONFIG_H
#define FIELD_CONFIG_H
#include "fields/id.h"
#include "fields/field.cuh"
#include "id.h"
#include "field.cuh"
/**
* @namespace field_config
@@ -11,27 +11,27 @@
* with the `-DFIELD` env variable passed during build.
*/
#if FIELD_ID == BN254
#include "fields/snark_fields/bn254_scalar.cuh"
#include "snark_fields/bn254_scalar.cuh"
namespace field_config = bn254;
#elif FIELD_ID == BLS12_381
#include "fields/snark_fields/bls12_381_scalar.cuh"
#include "snark_fields/bls12_381_scalar.cuh"
using bls12_381::fp_config;
namespace field_config = bls12_381;
#elif FIELD_ID == BLS12_377
#include "fields/snark_fields/bls12_377_scalar.cuh"
#include "snark_fields/bls12_377_scalar.cuh"
namespace field_config = bls12_377;
#elif FIELD_ID == BW6_761
#include "fields/snark_fields/bw6_761_scalar.cuh"
#include "snark_fields/bw6_761_scalar.cuh"
namespace field_config = bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "fields/snark_fields/grumpkin_scalar.cuh"
#include "snark_fields/grumpkin_scalar.cuh"
namespace field_config = grumpkin;
#elif FIELD_ID == BABY_BEAR
#include "fields/stark_fields/babybear.cuh"
#include "stark_fields/babybear.cuh"
namespace field_config = babybear;
#elif FIELD_ID == STARK_252
#include "fields/stark_fields/stark252.cuh"
#include "stark_fields/stark252.cuh"
namespace field_config = stark252;
#endif

View File

@@ -3,98 +3,97 @@
#define HOST_MATH_H
#include <cstdint>
#include <cuda_runtime.h>
#include "gpu-utils/modifiers.cuh"
#include "../gpu-utils/modifiers.cuh"
namespace host_math {
// return x + y with uint32_t operands
static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y with uint32_t operands
static uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y + carry with uint32_t operands
static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
// return x + y + carry with uint32_t operands
static uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
// return x + y and carry out with uint32_t operands
static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
// return x + y and carry out with uint32_t operands
static uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
uint32_t result;
result = x + y;
carry = x > result;
return result;
}
// return x + y + carry and carry out with uint32_t operands
static uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
const uint32_t result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
}
// return x - y with uint32_t operands
static uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y - borrow with uint32_t operands
static uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
// return x - y and borrow out with uint32_t operands
static uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
uint32_t result;
result = x - y;
borrow = x < result;
return result;
}
// return x - y - borrow and borrow out with uint32_t operands
static uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
const uint32_t result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
// return x * y + z + carry and carry out with uint32_t operands
static uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
{
uint32_t result;
uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
carry = (uint32_t)(r >> 32);
result = r & 0xffffffff;
return result;
}
template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;
constexpr carry_chain() : index(0) {}
uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
{
uint32_t result;
result = x + y;
carry = x > result;
return result;
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::add(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::add_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::addc_cc(x, y, carry);
else
return host_math::addc(x, y, carry);
}
// return x + y + carry and carry out with uint32_t operands
static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
{
const uint32_t result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::sub(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::sub_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::subc_cc(x, y, carry);
else
return host_math::subc(x, y, carry);
}
// return x - y with uint32_t operands
static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y - borrow with uint32_t operands
static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
// return x - y and borrow out with uint32_t operands
static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
uint32_t result;
result = x - y;
borrow = x < result;
return result;
}
// return x - y - borrow and borrow out with uint32_t operands
static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
const uint32_t result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
// return x * y + z + carry and carry out with uint32_t operands
static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
{
uint32_t result;
uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
carry = (uint32_t)(r >> 32);
result = r & 0xffffffff;
return result;
}
template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;
constexpr HOST_INLINE carry_chain() : index(0) {}
HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::add(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::add_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::addc_cc(x, y, carry);
else
return host_math::addc(x, y, carry);
}
HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::sub(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::sub_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::subc_cc(x, y, carry);
else
return host_math::subc(x, y, carry);
}
};
};
} // namespace host_math
#endif
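The carry_chain above stands in for the carry flag that the PTX add.cc/addc chain provides on the GPU: the first call uses add_cc to produce a carry, middle calls use addc_cc, and the last call uses addc. A minimal usage sketch (the add_2limb helper is hypothetical, not from this file):

#include <cstdint>
// Add two 2-limb (64-bit) little-endian numbers with the host carry chain.
void add_2limb(const uint32_t x[2], const uint32_t y[2], uint32_t r[2])
{
  host_math::carry_chain<2> chain;      // OPS_COUNT = 2, no carry in or out
  uint32_t carry = 0;
  r[0] = chain.add(x[0], y[0], carry);  // first op: add_cc, produces the carry
  r[1] = chain.add(x[1], y[1], carry);  // last op: addc, consumes the carry
}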

View File

@@ -1,139 +1,119 @@
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
namespace ptx {
__device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y)
uint32_t add(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y)
uint32_t add_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y)
uint32_t addc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y)
uint32_t addc_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y)
uint32_t sub(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y)
uint32_t sub_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y)
uint32_t subc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y)
uint32_t subc_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y)
uint32_t mul_lo(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y)
uint32_t mul_hi(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi)
uint64_t mov_b64(uint32_t lo, uint32_t hi)
{
uint64_t result;
asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi));
uint64_t result = 0;
return result;
}
@@ -141,142 +121,124 @@ namespace ptx {
// Callers should know exactly what they're calling (no implicit conversions).
namespace u64 {
__device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y)
uint64_t add(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y)
uint64_t add_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y)
uint64_t addc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y)
uint64_t addc_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y)
uint64_t sub(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y)
uint64_t sub_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y)
uint64_t subc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y)
uint64_t subc_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y)
uint64_t mul_lo(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y)
uint64_t mul_hi(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
} // namespace u64
__device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count)
void bar_arrive(const unsigned name, const unsigned count)
{
asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory");
return;
}
__device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count)
void bar_sync(const unsigned name, const unsigned count)
{
asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory");
return;
}
} // namespace ptx
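With the CPU mock, every ptx:: helper above now returns 0: enough to make dependent code compile, but not to compute anything. For comparison, a functional host equivalent of two of the simpler helpers would look like the sketch below (my assumption, not part of these commits):

#include <cstdint>
namespace ptx_host_sketch {
  // Low and high 32 bits of the full 64-bit product, matching mul.lo.u32 / mul.hi.u32.
  inline uint32_t mul_lo(uint32_t x, uint32_t y) { return (uint32_t)((uint64_t)x * y); }
  inline uint32_t mul_hi(uint32_t x, uint32_t y) { return (uint32_t)(((uint64_t)x * y) >> 32); }
}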

View File

@@ -1,8 +1,8 @@
#pragma once
#include "field.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
template <typename CONFIG>
class ExtensionField
@@ -16,12 +16,12 @@ private:
FWide real;
FWide imaginary;
friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
@@ -34,21 +34,21 @@ public:
FF real;
FF imaginary;
static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r}, xs.imaginary * FF{CONFIG::montgomery_r}};
}
static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}};
}
static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static void rand_host_many(ExtensionField* out, int size)
{
@@ -57,7 +57,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.imaginary)};
}
@@ -68,38 +68,38 @@ public:
return os;
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.imaginary)};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.imaginary};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
FWide real_prod = FF::mul_wide(xs.real, ys.real);
FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
@@ -110,40 +110,40 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{FF::mul_wide(xs.real, ys), FF::mul_wide(xs.imaginary, ys)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return mul_wide(ys, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
static constexpr ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
}
template <class T1, class T2>
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
friend ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}
friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
}
friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
template <const ExtensionField& multiplier>
static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
static ExtensionField mul_const(const ExtensionField& xs)
{
static constexpr FF mul_real = multiplier.real;
static constexpr FF mul_imaginary = multiplier.imaginary;
@@ -159,33 +159,33 @@ public:
}
template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
static constexpr ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
static constexpr ExtensionField neg(const ExtensionField& xs)
{
return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
}
// inverse of zero is set to be zero which is what we want most of the time
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
static constexpr ExtensionField inverse(const ExtensionField& xs)
{
ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr(xs.imaginary));
@@ -198,9 +198,9 @@ public:
template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
__device__ ExtensionField<CONFIG>* getPointer()
ExtensionField<CONFIG>* getPointer()
{
extern __shared__ ExtensionField<CONFIG> s_ext2_scalar_[];
ExtensionField<CONFIG> *s_ext2_scalar_;
return s_ext2_scalar_;
}
};
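For reference on the conjugate-based inversion shown above: in a quadratic extension x = a + b·u with u² = nonresidue, the inverse is x⁻¹ = (a − b·u) · (a² − nonresidue·b²)⁻¹, which is why the code forms the conjugate {real, −imaginary} and nonresidue·imaginary², reducing the problem to a single base-field inversion of the norm.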

View File

@@ -1,8 +1,8 @@
#pragma once
#include "field.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
template <typename CONFIG>
class ExtensionField
@@ -16,12 +16,12 @@ private:
FWide im2;
FWide im3;
friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
@@ -36,31 +36,31 @@ public:
FF im2;
FF im3;
static constexpr HOST_DEVICE_INLINE ExtensionField zero()
static constexpr ExtensionField zero()
{
return ExtensionField{FF::zero(), FF::zero(), FF::zero(), FF::zero()};
}
static constexpr HOST_DEVICE_INLINE ExtensionField one()
static constexpr ExtensionField one()
{
return ExtensionField{FF::one(), FF::zero(), FF::zero(), FF::zero()};
}
static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r},
xs.im3 * FF{CONFIG::montgomery_r}};
}
static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv},
xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}};
}
static HOST_INLINE ExtensionField rand_host()
static ExtensionField rand_host()
{
return ExtensionField{FF::rand_host(), FF::rand_host(), FF::rand_host(), FF::rand_host()};
}
@@ -72,7 +72,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{
FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.im1),
@@ -86,38 +86,38 @@ public:
return os;
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.im1, ys.im2, ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.im1), FF::neg(ys.im2), FF::neg(ys.im3)};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.im1, xs.im2, xs.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.im1, xs.im2, xs.im3};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
if (CONFIG::nonresidue_is_negative)
return ExtensionWide{
@@ -144,21 +144,21 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{
FF::mul_wide(xs.real, ys), FF::mul_wide(xs.im1, ys), FF::mul_wide(xs.im2, ys), FF::mul_wide(xs.im3, ys)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return ExtensionWide{
FF::mul_wide(xs, ys.real), FF::mul_wide(xs, ys.im1), FF::mul_wide(xs, ys.im2), FF::mul_wide(xs, ys.im3)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
static constexpr ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.im1),
@@ -166,21 +166,21 @@ public:
}
template <class T1, class T2>
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
friend ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}
friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3);
}
friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {
FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.im1),
@@ -188,27 +188,27 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
static constexpr ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
static constexpr ExtensionField neg(const ExtensionField& xs)
{
return {FF::neg(xs.real), FF::neg(xs.im1), FF::neg(xs.im2), FF::neg(xs.im3)};
}
// inverse of zero is set to be zero which is what we want most of the time
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
static constexpr ExtensionField inverse(const ExtensionField& xs)
{
FF x, x0, x2;
if (CONFIG::nonresidue_is_negative) {
@@ -249,9 +249,9 @@ public:
template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
__device__ ExtensionField<CONFIG>* getPointer()
ExtensionField<CONFIG>* getPointer()
{
extern __shared__ ExtensionField<CONFIG> s_ext4_scalar_[];
ExtensionField<CONFIG> *s_ext4_scalar_=nullptr;
return s_ext4_scalar_;
}
};

View File

@@ -2,7 +2,7 @@
#ifndef BN254_BASE_PARAMS_H
#define BN254_BASE_PARAMS_H
#include "fields/storage.cuh"
#include "../storage.cuh"
namespace bn254 {
struct fq_config {

View File

@@ -2,9 +2,9 @@
#ifndef BN254_SCALAR_PARAMS_H
#define BN254_SCALAR_PARAMS_H
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quadratic_extension.cuh"
#include "../storage.cuh"
#include "../field.cuh"
#include "../quadratic_extension.cuh"
namespace bn254 {
struct fp_config {

View File

@@ -1,8 +1,8 @@
#pragma once
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quartic_extension.cuh"
#include "../storage.cuh"
#include "../field.cuh"
#include "../quartic_extension.cuh"
namespace babybear {
struct fp_config {

View File

@@ -1,20 +1,20 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H
#include <cuda_runtime.h>
#include <cstddef>
namespace device_context {
constexpr std::size_t MAX_DEVICES = 32;
size_t MAX_DEVICES = 32;
/**
* Properties of the device used in icicle functions.
*/
struct DeviceContext {
cudaStream_t& stream; /**< Stream to use. Default value: 0. */
int stream; /**< Stream to use. Default value: 0. */
std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
int mempool; /**< Mempool to use. Default value: 0. */
};
/**
@@ -22,9 +22,9 @@ namespace device_context {
*/
inline DeviceContext get_default_device_context() // TODO: naming convention ?
{
static cudaStream_t default_stream = (cudaStream_t)0;
static int default_stream = 0;
return DeviceContext{
(cudaStream_t&)default_stream, // stream
default_stream, // stream
0, // device_id
0, // mempool
};
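A minimal host-side sketch of using the default context above (hypothetical caller; assumes only the CPU-mock DeviceContext shown here, where stream and mempool are plain ints, and that the header is on the include path):

// Illustrative only - not part of the diff.
int use_default_context()
{
  // stream = 0, device_id = 0, mempool = 0 in the mock.
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  // Downstream APIs (ntt, msm, poseidon, ...) take this context by reference.
  return static_cast<int>(ctx.device_id);
}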

View File

@@ -3,12 +3,10 @@
#define ERR_H
#include <iostream>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
enum class IcicleError_t {
enum IcicleError_t {
IcicleSuccess = 0,
InvalidArgument = 1,
MemoryAllocationError = 2,
@@ -38,14 +36,14 @@ private:
public:
// Constructor for cudaError_t with optional message
IcicleError(cudaError_t cudaError, const std::string& msg = "")
: std::runtime_error("CUDA Error: " + std::string(cudaGetErrorString(cudaError)) + " " + msg),
IcicleError(int cudaError, const std::string& msg = "")
: std::runtime_error("Error: " + msg),
errCode(static_cast<int>(cudaError))
{
}
// Constructor for cudaError_t with const char* message
IcicleError(cudaError_t cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
IcicleError(int cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
// Constructor for IcicleError_t with optional message
IcicleError(IcicleError_t icicleError, const std::string& msg = "")
@@ -67,11 +65,10 @@ public:
#define CHK_LOG(val) check((val), #val, __FILE__, __LINE__)
#define CHK_VAL(val, file, line) check((val), #val, file, line)
cudaError_t inline check(cudaError_t err, const char* const func, const char* const file, const int line)
int inline check(int err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
if (err != 0) {
std::cerr << "CUDA Runtime Error by: " << func << " at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << std::endl << std::endl;
}
return err;
@@ -90,12 +87,12 @@ cudaError_t inline check(cudaError_t err, const char* const func, const char* co
#define THROW_ICICLE_CUDA(val) throwIcicleCudaErr(val, __FUNCTION__, __FILE__, __LINE__)
#define THROW_ICICLE_CUDA_ERR(val, func, file, line) throwIcicleCudaErr(val, func, file, line)
void inline throwIcicleCudaErr(
cudaError_t err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
int err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
{
// TODO: fmt::format introduced only in C++20
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "") + std::string{cudaGetErrorString(err)} +
" : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
"\nThe error is reported there and may be caused by prior calls.\n";
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "");
// + " : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
// "\nThe error is reported there and may be caused by prior calls.\n";
std::cerr << err_msg << std::endl; // TODO: Logging
throw IcicleError{err, err_msg};
}
@@ -111,14 +108,14 @@ void inline throwIcicleErr(
throw IcicleError{err, err_msg};
}
cudaError_t inline checkCudaErrorIsSticky(
cudaError_t err, const char* const func, const char* const file, const int line, bool isThrowing = true)
int inline checkCudaErrorIsSticky(
int err, const char* const func, const char* const file, const int line, bool isThrowing = true)
{
if (err != cudaSuccess) {
if (err != 0) {
// check for sticky (unrecoverable) error when the only option is to restart process
cudaError_t err2 = cudaDeviceSynchronize();
int err2 = 0;
bool is_logged;
if (err2 != cudaSuccess) { // we suspect sticky error
if (err2 != 0) { // we suspect sticky error
if (err != err2) {
is_logged = true;
CHK_ERR(err, func, file, line);
@@ -139,13 +136,13 @@ cudaError_t inline checkCudaErrorIsSticky(
// most common macros to use
#define CHK_INIT_IF_RETURN() \
{ \
cudaError_t err_result = CHK_LAST(); \
int err_result = CHK_LAST(); \
if (err_result != cudaSuccess) return err_result; \
}
#define CHK_IF_RETURN(val) \
{ \
cudaError_t err_result = CHK_STICKY(val); \
int err_result = CHK_STICKY(val); \
if (err_result != cudaSuccess) return err_result; \
}
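A sketch of how these early-return macros are typically used in an API entry point (do_work and allocate_buffers are hypothetical; assumes cudaSuccess still evaluates to 0 in this CPU mock):

// Illustrative only - not part of the diff.
cudaError_t do_work(int n)
{
  CHK_INIT_IF_RETURN();               // return immediately if a prior error is pending
  CHK_IF_RETURN(allocate_buffers(n)); // propagate the first failing call's error code
  return CHK_LAST();                  // report any error raised since the last check
}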

View File

@@ -6,6 +6,6 @@
#define UNROLL #pragma unroll
#endif
#define HOST_INLINE __host__ INLINE_MACRO
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO
// #define __host__ INLINE_MACRO
// #define INLINE_MACRO
// #define __host__ INLINE_MACRO

View File

@@ -24,7 +24,7 @@
* definitions.
*
* To use dynamically allocated shared memory in a templatized __global__ or
* __device__ function, just replace code like this:
* function, just replace code like this:
*
* <pre>
* template<class T>
@@ -32,7 +32,7 @@
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* extern __shared__ T sdata[];
* T sdata[];
* ...
* doStuff(sdata);
* ...
@@ -62,7 +62,7 @@
*
* This struct uses template specialization on the type \a T to declare
* a differently named dynamic shared memory array for each type
* (\code extern __shared__ T s_type[] \endcode).
* (\code T s_type[] \endcode).
*
* Currently there are specializations for the following types:
* \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
@@ -73,11 +73,10 @@ template <typename T>
struct SharedMemory {
//! @brief Return a pointer to the runtime-sized shared memory array.
//! @returns Pointer to runtime-sized shared memory array
__device__ T* getPointer()
T* getPointer()
{
// extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
// Error_UnsupportedType();
return (T*)0;
T* a = nullptr; // Initialize pointer to nullptr or allocate memory as needed
return a;
}
// TODO: Use operator overloading to make this class look like a regular array
};
@@ -88,129 +87,128 @@ struct SharedMemory {
template <>
struct SharedMemory<int> {
__device__ int* getPointer()
int* getPointer()
{
extern __shared__ int s_int[];
return s_int;
return 0;
}
};
template <>
struct SharedMemory<unsigned int> {
__device__ unsigned int* getPointer()
unsigned int* getPointer()
{
extern __shared__ unsigned int s_uint[];
return s_uint;
return 0;
}
};
template <>
struct SharedMemory<char> {
__device__ char* getPointer()
char* getPointer()
{
extern __shared__ char s_char[];
return s_char;
char *a = nullptr;
return a;
}
};
template <>
struct SharedMemory<unsigned char> {
__device__ unsigned char* getPointer()
unsigned char* getPointer()
{
extern __shared__ unsigned char s_uchar[];
return s_uchar;
unsigned char* a = nullptr;
return a;
}
};
template <>
struct SharedMemory<short> {
__device__ short* getPointer()
short* getPointer()
{
extern __shared__ short s_short[];
return s_short;
short* a = nullptr;
return a;
}
};
template <>
struct SharedMemory<unsigned short> {
__device__ unsigned short* getPointer()
unsigned short* getPointer()
{
extern __shared__ unsigned short s_ushort[];
return s_ushort;
unsigned short* a = nullptr;
return a;
}
};
template <>
struct SharedMemory<long> {
__device__ long* getPointer()
long* getPointer()
{
extern __shared__ long s_long[];
long *s_long = nullptr;
return s_long;
}
};
template <>
struct SharedMemory<unsigned long> {
__device__ unsigned long* getPointer()
unsigned long* getPointer()
{
extern __shared__ unsigned long s_ulong[];
unsigned long *s_ulong = nullptr;
return s_ulong;
}
};
template <>
struct SharedMemory<long long> {
__device__ long long* getPointer()
long long* getPointer()
{
extern __shared__ long long s_longlong[];
long long *s_longlong;
return s_longlong;
}
};
template <>
struct SharedMemory<unsigned long long> {
__device__ unsigned long long* getPointer()
unsigned long long* getPointer()
{
extern __shared__ unsigned long long s_ulonglong[];
unsigned long long *s_ulonglong;
return s_ulonglong;
}
};
template <>
struct SharedMemory<bool> {
__device__ bool* getPointer()
bool* getPointer()
{
extern __shared__ bool s_bool[];
bool *s_bool;
return s_bool;
}
};
template <>
struct SharedMemory<float> {
__device__ float* getPointer()
float* getPointer()
{
extern __shared__ float s_float[];
float *s_float;
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ double* getPointer()
double* getPointer()
{
extern __shared__ double s_double[];
double *s_double;
return s_double;
}
};
template <>
struct SharedMemory<uchar4> {
__device__ uchar4* getPointer()
{
extern __shared__ uchar4 s_uchar4[];
return s_uchar4;
}
};
// template <>
// struct SharedMemory<uchar4> {
// uchar4* getPointer()
// {
// uchar4 *s_uchar4;
// return s_uchar4;
// }
// };
#endif //_SHAREDMEM_H_
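A minimal sketch of the pattern the comment block above describes (foo is hypothetical; note that in this CPU mock getPointer() returns a null/zero pointer rather than a real shared-memory array, so the body must not dereference it):

// Illustrative only - not part of the diff.
template <class T>
void foo(T* d_out, const T* d_in)
{
  SharedMemory<T> smem;
  T* sdata = smem.getPointer(); // typed alias for the (mocked) dynamic shared memory
  // ... stage d_in into sdata, compute, write results to d_out ...
  (void)sdata;
}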

View File

@@ -3,9 +3,9 @@
#define KECCAK_H
#include <cstdint>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "../../gpu-utils/device_context.cuh"
#include "../../gpu-utils/error_handler.cuh"
typedef int cudaError_t;
namespace keccak {
/**
* @struct KeccakConfig

View File

@@ -2,13 +2,12 @@
#ifndef MSM_H
#define MSM_H
#include <cuda_runtime.h>
#include "curves/affine.cuh"
#include "curves/projective.cuh"
#include "fields/field.cuh"
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "../curves/affine.cuh"
#include "../curves/projective.cuh"
#include "../fields/field.cuh"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"
/**
* @namespace msm
@@ -124,6 +123,8 @@ namespace msm {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
typedef int cudaError_t;
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);

View File

@@ -2,13 +2,11 @@
#ifndef NTT_H
#define NTT_H
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "../utils/utils_kernels.cuh"
#include "../utils/utils.h"
/**
* @namespace ntt
@@ -36,6 +34,8 @@ namespace ntt {
* primitive_root).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
typedef int cudaError_t;
template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);

View File

@@ -3,8 +3,9 @@
#define _NTT_IMPL_H
#include <stdint.h>
#include "ntt/ntt.cuh" // for enum Ordering
#include "ntt.cuh" // for enum Ordering
typedef int cudaError_t;
typedef int cudaStream_t;
namespace mxntt {
template <typename S>

View File

@@ -1,8 +1,8 @@
#pragma once
#include "gpu-utils/device_context.cuh"
#include "fields/field_config.cuh"
#include "polynomials/polynomials.h"
#include "../../gpu-utils/device_context.cuh"
#include "../../fields/field_config.cuh"
#include "../polynomials.h"
using device_context::DeviceContext;
@@ -11,7 +11,7 @@ namespace polynomials {
class CUDAPolynomialFactory : public AbstractPolynomialFactory<C, D, I>
{
std::vector<DeviceContext> m_device_contexts; // device-id --> device context
std::vector<cudaStream_t> m_device_streams; // device-id --> device stream. Storing the streams here as workaround
std::vector<int> m_device_streams; // device-id --> device stream. Storing the streams here as workaround
// since DeviceContext has a reference to a stream.
public:

View File

@@ -6,7 +6,7 @@
#include <algorithm> // for std::max
#include <cstdint> // for uint64_t, etc.
#include <memory>
#include "utils/integrity_pointer.h"
#include "../utils/integrity_pointer.h"
namespace polynomials {

View File

@@ -2,8 +2,8 @@
#include <iostream>
#include <memory>
#include "utils/integrity_pointer.h"
#include "fields/field_config.cuh"
#include "../utils/integrity_pointer.h"
#include "../fields/field_config.cuh"
#include "polynomial_context.h"
#include "polynomial_backend.h"

View File

@@ -4,9 +4,9 @@
#include <cstdint>
#include <stdexcept>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"
#include "../utils/utils.h"
/**
* @namespace poseidon
@@ -117,6 +117,7 @@ namespace poseidon {
/**
* Loads pre-calculated optimized constants, moves them to the device
*/
typedef int cudaError_t;
template <typename S>
cudaError_t
init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);

View File

@@ -9,7 +9,7 @@ namespace mont {
#define MAX_THREADS_PER_BLOCK 256
template <typename E, bool is_into>
__global__ void MontgomeryKernel(const E* input, int n, E* output)
void MontgomeryKernel(const E* input, int n, E* output)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { output[tid] = is_into ? E::to_montgomery(input[tid]) : E::from_montgomery(input[tid]); }

View File

@@ -4,14 +4,13 @@
namespace utils_internal {
template <typename E, typename S>
__global__ void NormalizeKernel(E* arr, S scalar, int n)
void NormalizeKernel(E* arr, S scalar, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { arr[tid] = scalar * arr[tid]; }
return;
}
template <typename E, typename S>
__global__ void BatchMulKernel(
void BatchMulKernel(
const E* in_vec,
int n_elements,
int batch_size,
@@ -22,12 +21,7 @@ namespace utils_internal {
bool bitrev,
E* out_vec)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n_elements * batch_size) {
int64_t scalar_id = tid % n_elements;
if (bitrev) scalar_id = __brev(scalar_id) >> (32 - logn);
out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
}
return;
}
} // namespace utils_internal

View File

@@ -2,7 +2,7 @@
#ifndef LDE_H
#define LDE_H
#include "gpu-utils/device_context.cuh"
#include "../gpu-utils/device_context.cuh"
/**
* @namespace vec_ops
@@ -57,6 +57,7 @@ namespace vec_ops {
* @tparam E The type of elements `vec_b` and `result`. Often (but not always) `E=S`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
typedef int cudaError_t;
template <typename E, typename S>
cudaError_t Mul(const S* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result);

View File

@@ -1,29 +1,26 @@
if (G2)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
endif ()
set(TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ${CMAKE_SOURCE_DIR}/src)
set(SRC ../../)
set(CURVE_SOURCE ${SRC}/curves/extern.cu)
set(CURVE_SOURCE ${SRC}/curves/extern.cpp)
if(G2)
list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cu)
endif()
if(MSM)
list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cu)
if(G2)
list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cu)
endif()
endif()
if(ECNTT)
list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cu)
list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cu)
list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${CURVE_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,40 @@
#define CURVE_ID BN254
#include "../../include/curves/curve_config.cuh"
using namespace curve_config;
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
// #include "../utils/mont.cuh"
extern "C" bool CONCAT_EXPAND(CURVE, eq)(projective_t* point1, projective_t* point2)
{
return true;
}
extern "C" void CONCAT_EXPAND(CURVE, to_affine)(projective_t* point, affine_t* point_out)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, generate_projective_points)(projective_t* points, int size)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, generate_affine_points)(affine_t* points, int size)
{
return;
}
extern "C" int CONCAT_EXPAND(CURVE, affine_convert_montgomery)(
affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}
extern "C" int CONCAT_EXPAND(CURVE, projective_convert_montgomery)(
projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -0,0 +1,39 @@
#include "curves/curve_config.cuh"
using namespace curve_config;
#include "gpu-utils/device_context.cuh"
#include "utils/utils.h"
#include "utils/mont.cuh"
extern "C" bool CONCAT_EXPAND(CURVE, g2_eq)(g2_projective_t* point1, g2_projective_t* point2)
{
return true;
}
extern "C" void CONCAT_EXPAND(CURVE, g2_to_affine)(g2_projective_t* point, g2_affine_t* point_out)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_projective_points)(g2_projective_t* points, int size)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_affine_points)(g2_affine_t* points, int size)
{
return;
}
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_affine_convert_montgomery)(
g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_projective_convert_montgomery)(
g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -1,40 +1,37 @@
if (EXT_FIELD)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
endif ()
SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin)
set(TARGET icicle_field)
set(SRC ${CMAKE_SOURCE_DIR}/src)
set(SRC ../../)
set(FIELD_SOURCE ${SRC}/fields/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
set(FIELD_SOURCE ${SRC}/fields/extern.cpp)
# list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
if(EXT_FIELD)
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu)
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cpp)
# list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
# list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
endif()
set(POLYNOMIAL_SOURCE_FILES
${SRC}/polynomials/polynomials.cu
${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
${SRC}/polynomials/polynomials_c_api.cu)
# set(POLYNOMIAL_SOURCE_FILES
# ${SRC}/polynomials/polynomials.cu
# ${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
# ${SRC}/polynomials/polynomials_c_api.cu)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
# list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
# TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
if (DEFINED CURVE)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
endif()
# if (DEFINED CURVE)
# list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
# list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
# endif()
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
endif()
# if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
# list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
# list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
# endif()
add_library(${TARGET} STATIC ${FIELD_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
target_compile_definitions(${TARGET} PUBLIC FIELD=${FIELD})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,19 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
//#include "../../include/utils/mont.cuh"
#include "../../include/utils/utils.h"
#include "../../include/gpu-utils/device_context.cuh"
extern "C" void CONCAT_EXPAND(FIELD, generate_scalars)(scalar_t* scalars, int size)
{
return;
}
extern "C" int CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(
scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -0,0 +1,18 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "utils/mont.cuh"
#include "utils/utils.h"
#include "gpu-utils/device_context.cuh"
extern "C" void CONCAT_EXPAND(FIELD, extension_generate_scalars)(extension_t* scalars, int size)
{
return;
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(
extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -1,5 +1,6 @@
set(TARGET icicle_hash)
add_library(${TARGET} STATIC keccak/keccak.cu)
add_library(${TARGET} STATIC keccak/keccak.cpp)
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,24 @@
#include "../../../include/hash/keccak/keccak.cuh"
typedef int cudaError_t;
namespace keccak {
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return 0;
}
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return 0;
}
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return 0;
}
} // namespace keccak

View File

@@ -144,14 +144,14 @@ namespace keccak {
element ^= rc; \
}
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
__device__ void keccakf(uint64_t s[25])
void keccakf(uint64_t s[25])
{
uint64_t t0, t1, t2, t3, t4;

View File

@@ -0,0 +1,28 @@
set(TARGET icicle_msm)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(MSM_SOURCE ${SRC}/msm/extern.cpp)
if(G2)
list(APPEND MSM_SOURCE ${SRC}/msm/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${MSM_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

46
icicle/src/msm/extern.cpp Normal file
View File

@@ -0,0 +1,46 @@
#define CURVE_ID BN254
#define FIELD_ID BN254
#include "../../include/curves/curve_config.cuh"
#include "../../include/fields/field_config.cuh"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/msm/msm.cuh"
typedef int cudaError_t;
using namespace curve_config;
using namespace field_config;
#include "../../include/utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref affine_t) of curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_bases_cuda)(
affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
affine_t* output_bases)
{
return 0;
}
/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* - `A` is the [affine representation](@ref affine_t) of curve points;
* - `P` is the [projective representation](@ref projective_t) of curve points.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, msm_cuda)(
const scalar_t* scalars, const affine_t* points, int msm_size, MSMConfig& config, projective_t* out)
{
return 0;
}
} // namespace msm
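A hypothetical caller of the mock entry point above. The exported symbol name is produced by CONCAT_EXPAND from the CURVE build definition; bn254_msm_cuda is assumed here purely for illustration, and the curve types are taken from curve_config as in the file above:

// Illustrative only - not part of the diff.
extern "C" int bn254_msm_cuda(
  const scalar_t* scalars, const affine_t* points, int msm_size, msm::MSMConfig& config, projective_t* out);

int run_mock_msm(const scalar_t* scalars, const affine_t* points, int n, msm::MSMConfig& cfg)
{
  projective_t result;
  // The CPU mock always returns 0 (success) and leaves `result` untouched.
  return bn254_msm_cuda(scalars, points, n, cfg, &result);
}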

View File

@@ -0,0 +1,43 @@
#include "curves/curve_config.cuh"
#include "fields/field_config.cuh"
using namespace curve_config;
using namespace field_config;
#include "msm.cu"
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_bases_cuda)(
g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
g2_affine_t* output_bases)
{
return precompute_msm_bases<g2_affine_t, g2_projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}
/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_msm_cuda)(
const scalar_t* scalars, const g2_affine_t* points, int msm_size, MSMConfig& config, g2_projective_t* out)
{
return msm<scalar_t, g2_affine_t, g2_projective_t>(scalars, points, msm_size, config, out);
}
} // namespace msm

View File

@@ -20,32 +20,32 @@ public:
unsigned p = 10;
// unsigned p = 1<<30;
static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
static Dummy_Scalar zero() { return {0}; }
static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
static Dummy_Scalar one() { return {1}; }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
{
os << scalar.x;
return os;
}
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
{
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
}
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
{
return {(p1.x + p2.x) % p1.p};
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static HOST_INLINE Dummy_Scalar rand_host()
static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static Dummy_Scalar rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};
@@ -57,32 +57,32 @@ class Dummy_Projective
public:
Dummy_Scalar x;
static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; }
static Dummy_Projective zero() { return {0}; }
static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; }
static Dummy_Projective one() { return {1}; }
static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
{
return {p1.x + p2.x};
}
// friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// return p1 + neg(p2);
// }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
{
os << point.x;
return os;
}
friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
{
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
@@ -95,14 +95,14 @@ public:
return res;
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
{
return (p1.x == p2.x);
}
static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
static bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
static HOST_INLINE Dummy_Projective rand_host()
static Dummy_Projective rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};

View File

@@ -0,0 +1,34 @@
set(TARGET icicle_ntt)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(NTT_SOURCE ${SRC}/ntt/extern.cpp)
set(NTT_SOURCE_EXTENSION ${SRC}/ntt/extern_extension.cpp)
set(NTT_SOURCE_EC ${SRC}/ntt/extern_ecntt.cpp)
set(NTT_SOURCE ${SRC}/ntt/extern.cpp)
if(G2)
list(APPEND NTT_SOURCE ${SRC}/ntt/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${NTT_SOURCE})
add_library(${TARGET} STATIC ${NTT_SOURCE_EXTENSION})
add_library(${TARGET} STATIC ${NTT_SOURCE_EC})
add_library(${TARGET} STATIC )
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -1,13 +1,21 @@
build_verification:
mkdir -p work
nvcc -o work/test_verification -I. -I../../include tests/verification.cu -std=c++17
g++ -o work/test_verification -Intt.cpp -Ikernel_ntt.cpp -Iextern.cpp -Iextern_ecntt.cpp -Iextern_extension.cpp -I../../include tests/verification.cpp -std=c++17
build_extern:
g++ -o work/test_verification -I../../include extern.cpp -std=c++17
extern.o: extern.cpp
g++ -std=c++17 -std=gnu++11 -c -o $@ $< -I../../include
test_verification: build_verification
work/test_verification
build_verification_ecntt:
mkdir -p work
nvcc -o work/test_verification_ecntt -I. -I../../include tests/verification.cu -std=c++17 -DECNTT
g++ -o work/test_verification_ecntt -I. -I../../include tests/verification.cpp -std=c++17 -DECNTT
test_verification_ecntt: build_verification_ecntt
work/test_verification_ecntt

60
icicle/src/ntt/extern.cpp Normal file
View File

@@ -0,0 +1,60 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "ntt.cpp"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
typedef int cudaError_t;
namespace ntt {
/**
* Extern "C" version of [init_domain](@ref init_domain) function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, initialize_domain)(
scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return 0;
}
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, ntt_cuda)(
const scalar_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, scalar_t* output)
{
return ntt<scalar_t, scalar_t>(input, size, dir, config, output);
}
/**
* Extern "C" version of [release_domain](@ref release_domain) function with the following values of template
* parameters (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, release_domain)(device_context::DeviceContext& ctx)
{
return release_domain<scalar_t>(ctx);
}
/**
* Extern "C" version of [get_root_of_unity](@ref get_root_of_unity) function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
*/
extern "C" scalar_t CONCAT_EXPAND(FIELD, get_root_of_unity)(uint32_t logn)
{
return get_root_of_unity<scalar_t>(logn);
}
} // namespace ntt

View File

@@ -0,0 +1,28 @@
#define FIELD_ID BN254
#define CURVE_ID BN254
#include "../../include/curves/curve_config.cuh"
#include "../../include/fields/field_config.cuh"
using namespace curve_config;
using namespace field_config;
#include "ntt.cpp"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
namespace ntt {
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
* - `E` is the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ecntt_cuda)(
const projective_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, projective_t* output)
{
return ntt<scalar_t, projective_t>(input, size, dir, config, output);
}
} // namespace ntt

View File

@@ -0,0 +1,24 @@
#define FIELD_ID BABY_BEAR
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "ntt.cpp"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
namespace ntt {
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the field is given by `-DFIELD` env variable during build):
* - `E` is the [field](@ref scalar_t);
* - `S` is the [extension](@ref extension_t) of `E` of appropriate degree;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_ntt_cuda)(
const extension_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, extension_t* output)
{
return ntt<scalar_t, extension_t>(input, size, dir, config, output);
}
} // namespace ntt

File diff suppressed because it is too large

View File

@@ -8,7 +8,7 @@ using namespace field_config;
namespace mxntt {
static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
static inline uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
{
uint32_t rev_num = 0, temp, dig_len;
if (dit) {
@@ -31,11 +31,11 @@ namespace mxntt {
return rev_num;
}
static inline __device__ uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }
static inline uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }
enum eRevType { None, RevToMixedRev, MixedRevToRev, NaturalToMixedRev, NaturalToRev, MixedRevToNatural };
static __device__ uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
static uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
{
switch (rev_type) {
case eRevType::RevToMixedRev:

338
icicle/src/ntt/ntt.cpp Normal file
View File

@@ -0,0 +1,338 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/ntt/ntt.cuh"
#include <unordered_map>
#include <vector>
#include <type_traits>
#include "../../include/gpu-utils/sharedmem.cuh"
#include "../../include/utils/utils_kernels.cuh"
#include "../../include/utils/utils.h"
#include "../../include/ntt/ntt_impl.cuh"
#include "../../include/gpu-utils/device_context.cuh"
#include <mutex>
#ifdef CURVE_ID
#include "../../include/curves/curve_config.cuh"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif
namespace ntt {
namespace {
// TODO: Set MAX THREADS based on GPU arch
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512;
const uint32_t MAX_THREADS_BATCH_ECNTT =
128; // TODO: hardcoded - allows (2^18 x 64) ECNTT for sm86, decrease this to allow larger ecntt length, batch
// size limited by on-device memory
const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * MAX_NUM_THREADS;
template <typename E>
void reverse_order_kernel(const E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
{
return;
}
/**
* Bit-reverses a batch of input arrays out-of-place inside GPU.
* for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in batch of arrays of some object of type T. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param batch_size the size of the batch.
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order_batch(
const E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
{
return;
}
/**
* Bit-reverses an input array out-of-place inside GPU.
* for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in array of some object of type T of size which is a power of 2. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order(const E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
{
reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
}
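// Illustrative note (not part of the original source): for n = 8 (logn = 3) the 3-bit
// reversal maps indices 0..7 to 0,4,2,6,1,5,3,7, so reverse_order swaps a[1] <-> a[4] and
// a[3] <-> a[6] while a[0], a[2], a[5] and a[7] stay in place.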
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
void ntt_template_kernel_shared_rev(
const E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int ss,
int logn,
E* __restrict__ arr_out)
{
return;
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
void ntt_template_kernel_shared(
const E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int s,
int logn,
E* __restrict__ arr_out)
{
return;
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
*/
template <typename E, typename S>
void
ntt_template_kernel(const E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
{
return;
}
/**
* NTT/INTT inplace batch
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
* @param d_input Input array
* @param n Size of `d_input`
* @param d_twiddles Twiddles
* @param n_twiddles Size of `d_twiddles`
* @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
* @param inverse true for iNTT
* @param coset should be array of length n or a nullptr if NTT is not computed on a coset
* @param stream CUDA stream
* @param is_async if false, perform sync of the supplied CUDA stream at the end of processing
* @param d_output Output array
*/
template <typename E, typename S>
cudaError_t ntt_inplace_batch_template(
const E* d_input,
int n,
S* d_twiddles,
int n_twiddles,
int batch_size,
int logn,
bool inverse,
bool dit,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t stream,
E* d_output)
{
return 0;
}
} // namespace
/**
* @struct Domain
* Struct containing information about the domain on which (i)NTT is evaluated i.e. twiddle factors.
* Twiddle factors are private, static and can only be set using [init_domain](@ref init_domain) function.
* The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
* @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field.
*/
template <typename S>
class Domain
{
// Mutex for protecting access to the domain/device container array
static inline std::mutex device_domain_mutex;
// The domain-per-device container - assumption is init_domain is called once per device per program.
int max_size = 0;
int max_log_size = 0;
S* twiddles = nullptr;
bool initialized = false; // protection for multi-threaded case
std::unordered_map<S, int> coset_index = {};
S* internal_twiddles = nullptr; // required by mixed-radix NTT
S* basic_twiddles = nullptr; // required by mixed-radix NTT
// mixed-radix NTT supports a fast-twiddle option at the cost of additional 4N memory (where N is max NTT size)
S* fast_external_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_external_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
public:
template <typename U>
friend cudaError_t init_domain(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);
template <typename U>
friend cudaError_t release_domain(device_context::DeviceContext& ctx);
template <typename U>
friend U get_root_of_unity(uint64_t logn, device_context::DeviceContext& ctx);
template <typename U>
friend U get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
template <typename U, typename E>
friend cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
};
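// Illustrative lifecycle sketch (assumes an already-constructed device_context::DeviceContext `ctx`;
// shown only to tie the friend declarations above together, as seen from caller code):
//
//   ntt::init_domain(ntt::get_root_of_unity<scalar_t>(1 << 20), ctx, /*fast_twiddles_mode=*/false);
//   ...                            // any number of ntt() calls (forward or inverse) of size <= 2^20
//   ntt::release_domain<scalar_t>(ctx);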
template <typename S>
// static inline Domain<S> domains_for_devices[device_context::MAX_DEVICES] = {};
static inline Domain<S> domains_for_devices[1] = {};
template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return 0;
}
template <typename S>
cudaError_t release_domain(device_context::DeviceContext& ctx)
{
return 0;
}
template <typename S>
S get_root_of_unity(uint64_t max_size)
{
  // round log2(max_size) up to the nearest integer
const auto log_max_size = static_cast<uint32_t>(std::ceil(std::log2(max_size)));
return S::omega(log_max_size);
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity(uint64_t logn);
template <typename S>
S get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx)
{
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
if (logn > domain.max_log_size) {
std::ostringstream oss;
oss << "NTT log_size=" << logn
<< " is too large for the domain. Consider generating your domain with a higher order root of unity.\n";
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
}
const size_t twiddles_idx = 1ULL << (domain.max_log_size - logn);
return domain.twiddles[twiddles_idx];
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
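// Worked example of the indexing above, assuming the domain stores successive powers of its
// generator (twiddles[i] = w^i with w of order 2^max_log_size): for max_log_size = 20 and
// logn = 10, the returned element is twiddles[1 << 10] = w^(2^10), whose order is
// 2^20 / 2^10 = 2^10, exactly the root of unity needed for a size-2^10 NTT.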
template <typename S>
static bool is_choosing_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
{
const bool is_mixed_radix_alg_supported = (logn > 3 && logn != 7);
if (!is_mixed_radix_alg_supported && config.columns_batch)
throw IcicleError(IcicleError_t::InvalidArgument, "columns batch is not supported for given NTT size");
const bool is_user_selected_radix2_alg = config.ntt_algorithm == NttAlgorithm::Radix2;
const bool is_force_radix2 = !is_mixed_radix_alg_supported || is_user_selected_radix2_alg;
if (is_force_radix2) return true;
const bool is_user_selected_mixed_radix_alg = config.ntt_algorithm == NttAlgorithm::MixedRadix;
if (is_user_selected_mixed_radix_alg) return false;
if (config.columns_batch) return false; // radix2 does not currently support columns batch mode.
// Heuristic to automatically select an algorithm
// Note that generally the decision depends on {logn, batch, ordering, inverse, coset, in-place, coeff-field} and
// the specific GPU.
  // The following heuristic is a simplification based on measurements. Users can try both and select the algorithm
// based on the specific case via the 'NTTConfig.ntt_algorithm' field
if (logn >= 16) return false; // mixed-radix is typically faster in those cases
if (logn <= 11) return true; // radix-2 is typically faster for batch<=256 in those cases
const int log_batch = (int)log2(batch_size);
return (logn + log_batch <= 18); // almost the cutoff point where both are equal
}
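// Illustrative caller-side sketch for overriding this heuristic (assumes an already-constructed
// DeviceContext `ctx`; the names below are the ones used elsewhere in this file):
//
//   auto config = ntt::default_ntt_config<scalar_t>(ctx);
//   config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix; // or Radix2 / Auto
//   ntt::ntt(input, size, ntt::NTTDir::kForward, config, output);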
template <typename S, typename E>
cudaError_t radix2_ntt(
const E* d_input,
E* d_output,
S* twiddles,
int ntt_size,
int max_size,
int batch_size,
bool is_inverse,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
{
return 0;
}
template <typename S, typename E>
cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
{
return 0;
}
template <typename S>
NTTConfig<S> default_ntt_config(const device_context::DeviceContext& ctx)
{
NTTConfig<S> config = {
ctx, // ctx
S::one(), // coset_gen
1, // batch_size
false, // columns_batch
Ordering::kNN, // ordering
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // is_async
NttAlgorithm::Auto, // ntt_algorithm
};
return config;
}
// explicit instantiation to avoid having to include this file
template NTTConfig<scalar_t> default_ntt_config(const device_context::DeviceContext& ctx);
} // namespace ntt

View File

@@ -0,0 +1,46 @@
#include "../../../include/fields/id.h"
#define FIELD_ID BN254
#ifdef ECNTT
#define CURVE_ID BN254
#include "../../../include/curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "../../../include/fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif
#include "../../../include/fields/field.cuh"
#include "../../../include/curves/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "../ntt.cpp"
// #include "../kernel_ntt.cpp"
#include <memory>
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}
void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
return;
}
int main(int argc, char** argv)
{
return 0;
}

View File

@@ -1,207 +0,0 @@
#include "fields/id.h"
#define FIELD_ID BN254
#ifdef ECNTT
#define CURVE_ID BN254
#include "curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif
#include "fields/field.cuh"
#include "curves/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "ntt.cu"
#include "kernel_ntt.cu"
#include <memory>
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}
__global__ void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
}
int main(int argc, char** argv)
{
cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
float icicle_time, new_time;
int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
int INV = (argc > 3) ? atoi(argv[3]) : false;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 150;
bool COLUMNS_BATCH = (argc > 5) ? atoi(argv[5]) : false;
int COSET_IDX = (argc > 6) ? atoi(argv[6]) : 2;
const ntt::Ordering ordering = (argc > 7) ? ntt::Ordering(atoi(argv[7])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 8) ? atoi(argv[8]) : true;
// Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
const char* ordering_str = ordering == ntt::Ordering::kNN ? "NN"
: ordering == ntt::Ordering::kNR ? "NR"
: ordering == ntt::Ordering::kRN ? "RN"
: ordering == ntt::Ordering::kRR ? "RR"
: ordering == ntt::Ordering::kNM ? "NM"
: "MN";
printf(
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, columns_batch=%d coset-idx=%d, ordering=%s, fast_tw=%d\n",
NTT_LOG_SIZE, INPLACE, INV, BATCH_SIZE, COLUMNS_BATCH, COSET_IDX, ordering_str, FAST_TW);
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)
// init domain
auto ntt_config = ntt::default_ntt_config<test_scalar>();
ntt_config.ordering = ordering;
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.batch_size = BATCH_SIZE;
ntt_config.columns_batch = COLUMNS_BATCH;
CHK_IF_RETURN(cudaEventCreate(&icicle_start));
CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
CHK_IF_RETURN(cudaEventCreate(&new_start));
CHK_IF_RETURN(cudaEventCreate(&new_stop));
auto start = std::chrono::high_resolution_clock::now();
const scalar_t basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx, FAST_TW);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << "initDomain took: " << duration / 1000 << " MS" << std::endl;
// cpu allocation
auto CpuScalars = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputOld = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputNew = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
// gpu allocation
scalar_t *GpuScalars, *GpuOutputOld, *GpuOutputNew;
scalar_t* GpuScalarsTransposed;
CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuScalarsTransposed, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
// init inputs
// incremental_values(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
random_samples(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
CHK_IF_RETURN(
cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuScalars, GpuScalarsTransposed, NTT_SIZE, BATCH_SIZE);
}
// inplace
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
}
for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
ntt_config.coset_gen = ntt_config.coset_gen * basic_root;
}
auto benchmark = [&](bool is_print, int iterations) -> cudaError_t {
// NEW
CHK_IF_RETURN(cudaEventRecord(new_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(ntt::ntt(
INPLACE ? GpuOutputNew
: COLUMNS_BATCH ? GpuScalarsTransposed
: GpuScalars,
NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputNew));
}
CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));
// OLD
CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::Radix2;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(
ntt::ntt(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
}
CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));
if (is_print) {
printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
printf("New Runtime=%0.3f MS\n", new_time / iterations);
}
return CHK_LAST();
};
CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
int count = INPLACE ? 1 : 10;
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
}
CHK_IF_RETURN(benchmark(true /*=print*/, count));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuOutputNew, GpuScalarsTransposed, BATCH_SIZE, NTT_SIZE);
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, GpuScalarsTransposed, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
// verify
CHK_IF_RETURN(
cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
CHK_IF_RETURN(
cudaMemcpy(CpuOutputOld.get(), GpuOutputOld, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
bool success = true;
for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
// if (i%64==0) printf("\n");
if (CpuOutputNew[i] != CpuOutputOld[i]) {
success = false;
// std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
// break;
} else {
// std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
// break;
}
}
const char* success_str = success ? "SUCCESS!" : "FAIL!";
printf("%s\n", success_str);
CHK_IF_RETURN(cudaFree(GpuScalars));
CHK_IF_RETURN(cudaFree(GpuOutputOld));
CHK_IF_RETURN(cudaFree(GpuOutputNew));
ntt::release_domain<test_scalar>(ntt_config.ctx);
return CHK_LAST();
}

View File

@@ -1,721 +0,0 @@
#ifndef T_NTT
#define T_NTT
#pragma once
#include <stdio.h>
#include <stdint.h>
#include "gpu-utils/modifiers.cuh"
struct stage_metadata {
uint32_t th_stride;
uint32_t ntt_block_size;
uint32_t batch_id;
uint32_t ntt_block_id;
uint32_t ntt_inp_id;
};
#define STAGE_SIZES_DATA \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
__device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
// construction for fast-twiddles
uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 6, 6, 9, 9, 10,
11, 11, 12, 15, 15, 16, 16, 18, 18, 20, 21, 21, 22, 23, 24};
#define STAGE_SIZES_DATA_FAST_TW \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
__device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
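// STAGE_PREV_SIZES appears to hold, for each log-size n, the sum of all but the last entry of
// STAGE_SIZES_DATA_FAST_TW[n], i.e. how many butterfly levels precede the final stage.
// For example, n = 16 decomposes as {6, 5, 5} and STAGE_PREV_SIZES[16] = 6 + 5 = 11.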
template <typename E, typename S>
class NTTEngine
{
public:
E X[8];
S WB[3];
S WI[7];
S WE[8];
DEVICE_INLINE void loadBasicTwiddles(S* basic_twiddles)
{
UNROLL
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[i];
}
}
DEVICE_INLINE void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
{
UNROLL
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[inv ? i + 3 : i];
}
}
DEVICE_INLINE void loadInternalTwiddles64(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles32(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles16(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
for (uint32_t i = 0; i < 8; i++) {
WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
}
}
DEVICE_INLINE void loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric64(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t i = 0; i < 8; i++) {
uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 6);
WE[i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric32(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 5);
WE[4 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric16(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 4);
WE[2 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}
DEVICE_INLINE void
loadGlobalData(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride];
}
}
DEVICE_INLINE void loadGlobalDataColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
}
}
DEVICE_INLINE void
storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride] = X[i];
}
}
DEVICE_INLINE void storeGlobalDataColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
}
}
DEVICE_INLINE void
loadGlobalData32(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
DEVICE_INLINE void loadGlobalData32ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride] = X[4 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData32ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
}
}
}
DEVICE_INLINE void
loadGlobalData16(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
DEVICE_INLINE void loadGlobalData16ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride] = X[2 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData16ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
}
}
}
DEVICE_INLINE void ntt4_2()
{
UNROLL
for (int i = 0; i < 2; i++) {
ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
}
}
DEVICE_INLINE void ntt2_4()
{
UNROLL
for (int i = 0; i < 4; i++) {
ntt2(X[2 * i], X[2 * i + 1]);
}
}
DEVICE_INLINE void ntt2(E& X0, E& X1)
{
E T;
T = X0 + X1;
X1 = X0 - X1;
X0 = T;
}
DEVICE_INLINE void ntt4(E& X0, E& X1, E& X2, E& X3)
{
E T;
T = X0 + X2;
X2 = X0 - X2;
X0 = X1 + X3;
X1 = X1 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
X1 = X1 * WB[0];
X3 = X2 - X1;
X1 = X2 + X1;
X2 = T - X0;
X0 = T + X0;
}
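  // Assuming WB[0] holds the primitive 4th root of unity w_4, ntt4 above is the standard
  // natural-order 4-point transform:
  //   X0' = (X0 + X2) + (X1 + X3)
  //   X1' = (X0 - X2) + w_4 * (X1 - X3)
  //   X2' = (X0 + X2) - (X1 + X3)
  //   X3' = (X0 - X2) - w_4 * (X1 - X3)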
// rbo version
DEVICE_INLINE void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
{
E T;
T = X0 - X1;
X0 = X0 + X1;
X1 = X2 + X3;
X3 = X2 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
X3 = X3 * WB[0];
X2 = X0 - X1;
X0 = X0 + X1;
X1 = T + X3;
X3 = T - X3;
}
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
E T;
// out of 56,623,104 possible mappings, we have:
T = X3 - X7;
X7 = X3 + X7;
X3 = X1 - X5;
X5 = X1 + X5;
X1 = X2 + X6;
X2 = X2 - X6;
X6 = X0 + X4;
X0 = X0 - X4;
T = T * WB[1];
X2 = X2 * WB[1];
X4 = X6 + X1;
X6 = X6 - X1;
X1 = X3 + T;
X3 = X3 - T;
T = X5 + X7;
X5 = X5 - X7;
X7 = X0 + X2;
X0 = X0 - X2;
X1 = X1 * WB[0];
X5 = X5 * WB[1];
X3 = X3 * WB[2];
X2 = X6 + X5;
X6 = X6 - X5;
X5 = X7 - X1;
X1 = X7 + X1;
X7 = X0 - X3;
X3 = X0 + X3;
X0 = X4 + T;
X4 = X4 - T;
}
DEVICE_INLINE void ntt8win()
{
E T;
T = X[3] - X[7];
X[7] = X[3] + X[7];
X[3] = X[1] - X[5];
X[5] = X[1] + X[5];
X[1] = X[2] + X[6];
X[2] = X[2] - X[6];
X[6] = X[0] + X[4];
X[0] = X[0] - X[4];
X[2] = X[2] * WB[0];
X[4] = X[6] + X[1];
X[6] = X[6] - X[1];
X[1] = X[3] + T;
X[3] = X[3] - T;
T = X[5] + X[7];
X[5] = X[5] - X[7];
X[7] = X[0] + X[2];
X[0] = X[0] - X[2];
X[1] = X[1] * WB[1];
X[5] = X[5] * WB[0];
X[3] = X[3] * WB[2];
X[2] = X[6] + X[5];
X[6] = X[6] - X[5];
X[5] = X[1] + X[3];
X[3] = X[1] - X[3];
X[1] = X[7] + X[5];
X[5] = X[7] - X[5];
X[7] = X[0] - X[3];
X[3] = X[0] + X[3];
X[0] = X[4] + T;
X[4] = X[4] - T;
}
DEVICE_INLINE void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + i * 8 + column_id];
}
}
}
DEVICE_INLINE void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + row_id * 8 + i];
}
}
}
DEVICE_INLINE void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + i * 4 + column_id];
}
}
}
DEVICE_INLINE void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + row_id * 8 + i];
}
}
}
DEVICE_INLINE void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + i * 8 + column_id + j];
}
}
}
}
DEVICE_INLINE void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + row_id * 4 + 4 * j + i];
}
}
}
}
DEVICE_INLINE void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + i * 2 + column_id];
}
}
}
DEVICE_INLINE void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + row_id * 8 + i];
}
}
}
DEVICE_INLINE void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + i * 8 + column_id + j];
}
}
}
}
DEVICE_INLINE void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + row_id * 2 + 2 * j + i];
}
}
}
}
DEVICE_INLINE void twiddlesInternal()
{
UNROLL
for (int i = 1; i < 8; i++) {
X[i] = X[i] * WI[i - 1];
}
}
DEVICE_INLINE void twiddlesExternal()
{
UNROLL
for (int i = 0; i < 8; i++) {
X[i] = X[i] * WE[i];
}
}
};
#endif

View File

@@ -0,0 +1,27 @@
set(TARGET icicle_poly)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(POLY_SOURCE ${SRC}/polynomials/polynomials.cpp)
set(POLY_API_SOURCE ${SRC}/polynomials/polynomials_c_api.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${POLY_SOURCE} ${POLY_API_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -39,7 +39,7 @@ namespace polynomials {
/*============================== evaluate ==============================*/
template <typename T>
__device__ T pow(T base, int exp)
T pow(T base, int exp)
{
T result = T::one();
while (exp > 0) {

View File

@@ -0,0 +1,204 @@
#define FIELD_ID BN254
#include "../../include/polynomials/polynomials.h"
namespace polynomials {
template <typename C, typename D, typename I>
Polynomial<C, D, I>::Polynomial()
{
if (nullptr == s_factory) {
throw std::runtime_error("Polynomial factory not initialized. Must call Polynomial::initialize(factory)");
}
m_context = s_factory->create_context();
m_backend = s_factory->create_backend();
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::from_coefficients(const C* coefficients, uint64_t nof_coefficients)
{
Polynomial<C, D, I> P = {};
P.m_backend->from_coefficients(P.m_context, nof_coefficients, coefficients);
return P;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::from_rou_evaluations(const I* evaluations, uint64_t nof_evaluations)
{
Polynomial<C, D, I> P = {};
P.m_backend->from_rou_evaluations(P.m_context, nof_evaluations, evaluations);
return P;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::clone() const
{
Polynomial<C, D, I> P = {};
m_backend->clone(P.m_context, m_context);
return P;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::slice(uint64_t offset, uint64_t stride, uint64_t size)
{
Polynomial res = {};
m_backend->slice(res.m_context, this->m_context, offset, stride, size);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::even()
{
return slice(0, 2, 0 /*all elements*/);
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::odd()
{
return slice(1, 2, 0 /*all elements*/);
}
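// Worked example for the two slices above: if p(x) = a0 + a1*x + a2*x^2 + a3*x^3 then
// p.even() = a0 + a2*x and p.odd() = a1 + a3*x, so p(x) = p.even()(x^2) + x * p.odd()(x^2).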
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator+(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->add(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator-(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->subtract(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator*(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->multiply(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator*(const D& scalar) const
{
Polynomial<C, D, I> res = {};
m_backend->multiply(res.m_context, m_context, scalar);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs)
{
return rhs * scalar;
}
template <typename C, typename D, typename I>
std::pair<Polynomial<C, D, I>, Polynomial<C, D, I>> Polynomial<C, D, I>::divide(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> Q = {}, R = {};
m_backend->divide(Q.m_context, R.m_context, m_context, rhs.m_context);
return std::make_pair(std::move(Q), std::move(R));
}
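// divide() is expected to return (Q, R) such that *this = Q * rhs + R with deg(R) < deg(rhs);
// operator/ and operator% below expose the quotient and the remainder separately.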
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator/(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->quotient(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator%(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->remainder(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::divide_by_vanishing_polynomial(uint64_t vanishing_polynomial_degree) const
{
Polynomial<C, D, I> res = {};
m_backend->divide_by_vanishing_polynomial(res.m_context, m_context, vanishing_polynomial_degree);
return res;
}
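// Over a size-N roots-of-unity domain, the vanishing polynomial of degree N is x^N - 1
// (it vanishes at every N-th root of unity), so this presumably computes *this / (x^N - 1).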
template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::operator+=(const Polynomial& rhs)
{
m_backend->add(m_context, m_context, rhs.m_context);
return *this;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::add_monomial_inplace(C monomial_coeff, uint64_t monomial)
{
m_backend->add_monomial_inplace(m_context, monomial_coeff, monomial);
return *this;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::sub_monomial_inplace(C monomial_coeff, uint64_t monomial)
{
m_backend->sub_monomial_inplace(m_context, monomial_coeff, monomial);
return *this;
}
template <typename C, typename D, typename I>
I Polynomial<C, D, I>::operator()(const D& x) const
{
I eval = {};
evaluate(&x, &eval);
return eval;
}
template <typename C, typename D, typename I>
void Polynomial<C, D, I>::evaluate(const D* x, I* eval) const
{
m_backend->evaluate(m_context, x, eval);
}
template <typename C, typename D, typename I>
void Polynomial<C, D, I>::evaluate_on_domain(D* domain, uint64_t size, I* evals /*OUT*/) const
{
return m_backend->evaluate_on_domain(m_context, domain, size, evals);
}
template <typename C, typename D, typename I>
int64_t Polynomial<C, D, I>::degree()
{
return m_backend->degree(m_context);
}
template <typename C, typename D, typename I>
C Polynomial<C, D, I>::get_coeff(uint64_t idx) const
{
return m_backend->get_coeff(m_context, idx);
}
template <typename C, typename D, typename I>
uint64_t Polynomial<C, D, I>::copy_coeffs(C* host_coeffs, uint64_t start_idx, uint64_t end_idx) const
{
return m_backend->copy_coeffs(m_context, host_coeffs, start_idx, end_idx);
}
template <typename C, typename D, typename I>
std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
Polynomial<C, D, I>::get_coefficients_view()
{
return m_backend->get_coefficients_view(m_context);
}
template <typename C, typename D, typename I>
std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
Polynomial<C, D, I>::get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
{
return m_backend->get_rou_evaluations_view(m_context, nof_evaluations, is_reversed);
}
// explicit instantiation for default type (scalar field)
template class Polynomial<scalar_t>;
template Polynomial<scalar_t> operator*(const scalar_t& c, const Polynomial<scalar_t>& rhs);
} // namespace polynomials

View File

@@ -0,0 +1,284 @@
#define FIELD_ID BN254
#include "../../include/polynomials/polynomials.h"
#include "../../include/fields/field_config.cuh"
#include "../../include/utils/utils.h"
#include "../../include/utils/integrity_pointer.h"
#include "../../include/polynomials/cuda_backend/polynomial_cuda_backend.cuh"
namespace polynomials {
extern "C" {
// Defines a polynomial instance based on the scalar type from the FIELD configuration.
typedef Polynomial<scalar_t> PolynomialInst;
bool CONCAT_EXPAND(FIELD, polynomial_init_cuda_backend)()
{
static auto cuda_factory = std::make_shared<CUDAPolynomialFactory<scalar_t>>();
PolynomialInst::initialize(cuda_factory);
return cuda_factory != nullptr;
}
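// Illustrative C-caller sketch (assumes FIELD expands to bn254 here, so the exported symbols
// are prefixed with bn254_; the coefficient array and its size are caller-provided):
//
//   bn254_polynomial_init_cuda_backend();                                  // once per process
//   PolynomialInst* p = bn254_polynomial_create_from_coefficients(coeffs, size);
//   PolynomialInst* q = bn254_polynomial_clone(p);
//   bn254_polynomial_add_inplace(p, q);
//   bn254_polynomial_delete(q);
//   bn254_polynomial_delete(p);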
// Constructs a polynomial from a set of coefficients.
// coeffs: Array of coefficients.
// size: Number of coefficients in the array.
// Returns a pointer to the newly created polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_coefficients)(scalar_t* coeffs, size_t size)
{
auto result = new PolynomialInst(PolynomialInst::from_coefficients(coeffs, size));
return result;
}
// Constructs a polynomial from evaluations at the roots of unity.
// evals: Array of evaluations.
// size: Number of evaluations in the array.
// Returns a pointer to the newly created polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_rou_evaluations)(scalar_t* evals, size_t size)
{
auto result = new PolynomialInst(PolynomialInst::from_rou_evaluations(evals, size));
return result;
}
// Clones an existing polynomial instance.
// p: Pointer to the polynomial instance to clone.
// Returns a pointer to the cloned polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_clone)(const PolynomialInst* p)
{
auto result = new PolynomialInst(p->clone());
return result;
}
// Deletes a polynomial instance, freeing its memory.
// instance: Pointer to the polynomial instance to delete.
void CONCAT_EXPAND(FIELD, polynomial_delete)(PolynomialInst* instance) { delete instance; }
// Prints a polynomial to stdout
void CONCAT_EXPAND(FIELD, polynomial_print)(PolynomialInst* p) { std::cout << *p << std::endl; }
// Adds two polynomials.
// a, b: Pointers to the polynomial instances to add.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_add)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a + *b));
return result;
}
// Adds a polynomial to another in place.
// a: Pointer to the polynomial to add to.
// b: Pointer to the polynomial to add.
void CONCAT_EXPAND(FIELD, polynomial_add_inplace)(PolynomialInst* a, const PolynomialInst* b) { *a += *b; }
// Subtracts one polynomial from another.
// a, b: Pointers to the polynomial instances (minuend and subtrahend, respectively).
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_subtract)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a - *b));
return result;
}
// Multiplies two polynomials.
// a, b: Pointers to the polynomial instances to multiply.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a * *b));
return result;
}
// Multiplies a polynomial by scalar.
// a: Pointer to the polynomial instance.
// scalar: Scalar to multiply by.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply_by_scalar)(const PolynomialInst* a, const scalar_t& scalar)
{
auto result = new PolynomialInst(std::move(*a * scalar));
return result;
}
// Divides one polynomial by another, returning both quotient and remainder.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// q: Output parameter for the quotient.
// r: Output parameter for the remainder.
void CONCAT_EXPAND(FIELD, polynomial_division)(
const PolynomialInst* a, const PolynomialInst* b, PolynomialInst** q /*OUT*/, PolynomialInst** r /*OUT*/)
{
auto [_q, _r] = a->divide(*b);
*q = new PolynomialInst(std::move(_q));
*r = new PolynomialInst(std::move(_r));
}
// Calculates the quotient of dividing one polynomial by another.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// Returns a pointer to the resulting quotient polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_quotient)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a / *b));
return result;
}
// Calculates the remainder of dividing one polynomial by another.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// Returns a pointer to the resulting remainder polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_remainder)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a % *b));
return result;
}
// Divides a polynomial by the vanishing polynomial of a given degree over the roots-of-unity domain.
// p: Pointer to the polynomial instance.
// vanishing_poly_degree: Degree of the vanishing polynomial.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst*
CONCAT_EXPAND(FIELD, polynomial_divide_by_vanishing)(const PolynomialInst* p, uint64_t vanishing_poly_degree)
{
auto result = new PolynomialInst(std::move(p->divide_by_vanishing_polynomial(vanishing_poly_degree)));
return result;
}
// Adds a monomial to a polynomial in place.
// p: Pointer to the polynomial instance.
// monomial_coeff: Coefficient of the monomial to add.
// monomial: Degree of the monomial to add.
void CONCAT_EXPAND(FIELD, polynomial_add_monomial_inplace)(
PolynomialInst* p, const scalar_t& monomial_coeff, uint64_t monomial)
{
p->add_monomial_inplace(monomial_coeff, monomial);
}
// Subtracts a monomial from a polynomial in place.
// p: Pointer to the polynomial instance.
// monomial_coeff: Coefficient of the monomial to subtract.
// monomial: Degree of the monomial to subtract.
void CONCAT_EXPAND(FIELD, polynomial_sub_monomial_inplace)(
PolynomialInst* p, const scalar_t& monomial_coeff, uint64_t monomial)
{
p->sub_monomial_inplace(monomial_coeff, monomial);
}
// Creates a new polynomial instance by slicing an existing polynomial.
// p: Pointer to the original polynomial instance to be sliced.
// offset: Starting index for the slice.
// stride: Interval between elements in the slice.
// size: Number of elements in the slice.
// Returns: Pointer to the new polynomial instance containing the slice.
PolynomialInst*
CONCAT_EXPAND(FIELD, polynomial_slice)(PolynomialInst* p, uint64_t offset, uint64_t stride, uint64_t size)
{
auto result = new PolynomialInst(std::move(p->slice(offset, stride, size)));
return result;
}
// Creates a new polynomial instance containing only the even-powered terms of the original polynomial.
// p: Pointer to the original polynomial instance.
// Returns: Pointer to the new polynomial instance containing only even-powered terms.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_even)(PolynomialInst* p)
{
auto result = new PolynomialInst(std::move(p->even()));
return result;
}
// Creates a new polynomial instance containing only the odd-powered terms of the original polynomial.
// p: Pointer to the original polynomial instance.
// Returns: Pointer to the new polynomial instance containing only odd-powered terms.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_odd)(PolynomialInst* p)
{
auto result = new PolynomialInst(std::move(p->odd()));
return result;
}
// Evaluates a polynomial on a domain of points.
// p: Pointer to the polynomial instance.
// domain: Array of points constituting the domain.
// domain_size: Number of points in the domain.
// evals: Output array for the evaluations.
void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_domain)(
const PolynomialInst* p, scalar_t* domain, uint64_t domain_size, scalar_t* evals /*OUT*/)
{
return p->evaluate_on_domain(domain, domain_size, evals);
}
// Returns the degree of a polynomial.
// p: Pointer to the polynomial instance.
// Returns the degree of the polynomial.
int64_t CONCAT_EXPAND(FIELD, polynomial_degree)(PolynomialInst* p) { return p->degree(); }
// Copies a range of polynomial coefficients to host/device memory.
// p: Pointer to the polynomial instance.
// memory: Array to copy the coefficients into. If NULL, nothing is copied.
// start_idx: Start index of the range to copy.
// end_idx: End index of the range to copy.
// Returns the number of coefficients copied. If memory is NULL, returns the total number of coefficients.
uint64_t CONCAT_EXPAND(FIELD, polynomial_copy_coeffs_range)(
PolynomialInst* p, scalar_t* memory, uint64_t start_idx, uint64_t end_idx)
{
return p->copy_coeffs(memory, start_idx, end_idx);
}
// Retrieves a raw device-memory pointer to the polynomial coefficients.
// p: Pointer to the polynomial instance.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a raw mutable pointer to the coefficients.
scalar_t* CONCAT_EXPAND(FIELD, polynomial_get_coeffs_raw_ptr)(
PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [coeffs, _size, _device_id] = p->get_coefficients_view();
*size = _size;
*device_id = _device_id;
return const_cast<scalar_t*>(coeffs.get());
}
// Retrieves a device-memory view of the polynomial coefficients.
// p: Pointer to the polynomial instance.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a pointer to an integrity pointer encapsulating the coefficients view.
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_coeff_view)(
PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [coeffs, _size, _device_id] = p->get_coefficients_view();
*size = _size;
*device_id = _device_id;
return new IntegrityPointer<scalar_t>(std::move(coeffs));
}
// Retrieves a device-memory view of the polynomial's evaluations on the roots of unity.
// p: Pointer to the polynomial instance.
// nof_evals: Number of evaluations.
// is_reversed: Whether the evaluations are in reversed order.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a pointer to an integrity pointer encapsulating the evaluations view.
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [rou_evals, _size, _device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
*size = _size;
*device_id = _device_id;
return new IntegrityPointer<scalar_t>(std::move(rou_evals));
}
// Reads the pointer from an integrity pointer.
// p: Pointer to the integrity pointer.
// Returns the raw pointer if still valid, otherwise NULL.
const scalar_t* CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_get)(IntegrityPointer<scalar_t>* p)
{
return p->get();
}
// Checks if an integrity pointer is still valid.
// p: Pointer to the integrity pointer.
// Returns true if the pointer is valid, false otherwise.
bool CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_is_valid)(IntegrityPointer<scalar_t>* p) { return p->isValid(); }
// Destroys an integrity pointer, freeing its resources.
// p: Pointer to the integrity pointer to destroy.
void CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_destroy)(IntegrityPointer<scalar_t>* p) { delete p; }
} // extern "C"
} // namespace polynomials

View File

@@ -0,0 +1,25 @@
set(TARGET icicle_poseidon)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(POSEIDON_SOURCE ${SRC}/poseidon/poseidon.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${POSEIDON_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -1,21 +1,21 @@
#include "poseidon/poseidon.cuh"
#include "../../include/poseidon/poseidon.cuh"
/// These are pre-calculated constants for different curves
#include "fields/id.h"
#include "../../include/fields/id.h"
#if FIELD_ID == BN254
#include "poseidon/constants/bn254_poseidon.h"
#include "../../include/poseidon/constants/bn254_poseidon.h"
using namespace poseidon_constants_bn254;
#elif FIELD_ID == BLS12_381
#include "poseidon/constants/bls12_381_poseidon.h"
#include "../../include/poseidon/constants/bls12_381_poseidon.h"
using namespace poseidon_constants_bls12_381;
#elif FIELD_ID == BLS12_377
#include "poseidon/constants/bls12_377_poseidon.h"
#include "../../include/poseidon/constants/bls12_377_poseidon.h"
using namespace poseidon_constants_bls12_377;
#elif FIELD_ID == BW6_761
#include "poseidon/constants/bw6_761_poseidon.h"
#include "../../include/poseidon/constants/bw6_761_poseidon.h"
using namespace poseidon_constants_bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "poseidon/constants/grumpkin_poseidon.h"
#include "../../include/poseidon/constants/grumpkin_poseidon.h"
using namespace poseidon_constants_grumpkin;
#endif
@@ -29,8 +29,8 @@ namespace poseidon {
device_context::DeviceContext& ctx,
PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = ctx.stream;
// CHK_INIT_IF_RETURN();
int& stream = ctx.stream;
int width = arity + 1;
int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
int mds_matrix_len = width * width;
@@ -39,10 +39,10 @@ namespace poseidon {
// Malloc memory for copying constants
S* d_constants;
CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
// CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
// Copy constants
CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
// CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
S* round_constants = d_constants;
S* mds_matrix = round_constants + round_constants_len;
@@ -56,18 +56,18 @@ namespace poseidon {
S domain_tag = S::from(tree_domain_tag_value);
// Make sure all the constants have been copied
CHK_IF_RETURN(cudaStreamSynchronize(stream));
// CHK_IF_RETURN(cudaStreamSynchronize(stream));
*poseidon_constants = {arity, partial_rounds, full_rounds_half, round_constants,
mds_matrix, non_sparse_matrix, sparse_matrices, domain_tag};
return CHK_LAST();
return 0;
}
template <typename S>
cudaError_t init_optimized_poseidon_constants(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
//CHK_INIT_IF_RETURN();
int full_rounds_half = FULL_ROUNDS_DEFAULT;
int partial_rounds;
unsigned char* constants;
@@ -96,7 +96,7 @@ namespace poseidon {
create_optimized_poseidon_constants(arity, full_rounds_half, partial_rounds, h_constants, ctx, poseidon_constants);
return CHK_LAST();
return 0; //CHK_LAST();
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(

View File

@@ -1,38 +1,15 @@
#include "poseidon/poseidon.cuh"
#include "gpu-utils/modifiers.cuh"
#include "../../include/poseidon/poseidon.cuh"
#include "../../include/gpu-utils/modifiers.cuh"
namespace poseidon {
template <typename S, int T>
__global__ void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int state_number = idx / T;
if (state_number >= number_of_states) { return; }
int element_number = idx % T;
S prepared_element;
// Domain separation
if (element_number == 0) {
prepared_element = domain_tag;
} else {
if (aligned) {
prepared_element = states[idx];
} else {
prepared_element = states[idx - 1];
}
}
// We need __syncthreads here if the state is not aligned
// because then we need to shift the vector [A, B, 0] -> [D, A, B]
if (!aligned) { __syncthreads(); }
// Store element in state
states[idx] = prepared_element;
return;
}
template <typename S>
DEVICE_INLINE S sbox_alpha_five(S element)
S sbox_alpha_five(S element)
{
S result = S::sqr(element);
result = S::sqr(result);
@@ -40,23 +17,13 @@ namespace poseidon {
}
template <typename S, int T>
__device__ S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
{
__syncthreads();
shared_states[threadIdx.x] = element;
__syncthreads();
typename S::Wide element_wide = S::mul_wide(shared_states[vec_number * T], matrix[element_number]);
UNROLL
for (int i = 1; i < T; i++) {
element_wide = element_wide + S::mul_wide(shared_states[vec_number * T + i], matrix[i * T + element_number]);
}
return S::reduce(element_wide);
return element;
}
template <typename S, int T>
__device__ S full_round(
S full_round(
S element,
size_t rc_offset,
int local_state_number,
@@ -67,110 +34,40 @@ namespace poseidon {
S* shared_states,
const PoseidonConstants<S>& constants)
{
if (add_pre_round_constants) {
element = element + constants.round_constants[rc_offset + element_number];
rc_offset += T;
}
element = sbox_alpha_five(element);
if (!skip_rc) { element = element + constants.round_constants[rc_offset + element_number]; }
// Multiply all the states by mds matrix
S* matrix = multiply_by_mds ? constants.mds_matrix : constants.non_sparse_matrix;
return vecs_mul_matrix<S, T>(element, matrix, element_number, local_state_number, shared_states);
return element;
}
template <typename S, int T>
__global__ void full_rounds(
void full_rounds(
S* states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConstants<S> constants)
{
extern __shared__ S shared_states[];
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
int state_number = idx / T;
if (state_number >= number_of_states) { return; }
int local_state_number = threadIdx.x / T;
int element_number = idx % T;
S new_el = states[idx];
bool add_pre_round_constants = first_half;
for (int i = 0; i < constants.full_rounds_half; i++) {
new_el = full_round<S, T>(
new_el, rc_offset, local_state_number, element_number, !first_half || (i < (constants.full_rounds_half - 1)),
add_pre_round_constants, !first_half && (i == constants.full_rounds_half - 1), shared_states, constants);
rc_offset += T;
if (add_pre_round_constants) {
rc_offset += T;
add_pre_round_constants = false;
}
}
states[idx] = new_el;
return;
}
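A CPU-side sketch of the round-constant bookkeeping in the full-rounds loop; the field operations and the matrix multiply are stubbed out, and the helper names are placeholders rather than library API. Pre-round constants consume an extra T constants once at the start of the first half, and the post-S-box constant is skipped on the final round of the second half:

#include <cstddef>
#include <cstdint>
#include <vector>

static uint64_t add_rc(uint64_t x, uint64_t rc) { return x + rc; } // stub
static uint64_t sbox5(uint64_t x) { return x * x * x * x * x; }    // stub

void full_rounds_cpu(
  std::vector<uint64_t>& state, const std::vector<uint64_t>& round_constants,
  size_t& rc_offset, int full_rounds_half, bool first_half)
{
  const int T = (int)state.size();
  bool add_pre_round_constants = first_half;
  for (int r = 0; r < full_rounds_half; r++) {
    bool skip_rc = !first_half && (r == full_rounds_half - 1);
    for (int e = 0; e < T; e++) {
      uint64_t el = state[e];
      if (add_pre_round_constants) el = add_rc(el, round_constants[rc_offset + e]);
      el = sbox5(el);
      if (!skip_rc)
        el = add_rc(el, round_constants[rc_offset + (add_pre_round_constants ? T : 0) + e]);
      state[e] = el;
    }
    // (the MDS / non-sparse matrix multiply of the whole state would happen here)
    rc_offset += T;
    if (add_pre_round_constants) { rc_offset += T; add_pre_round_constants = false; }
  }
}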
template <typename S, int T>
__device__ S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
{
S element = state[0];
element = sbox_alpha_five(element);
element = element + constants.round_constants[rc_offset];
S* sparse_matrix = &constants.sparse_matrices[(T * 2 - 1) * round_number];
typename S::Wide state_0_wide = S::mul_wide(element, sparse_matrix[0]);
UNROLL
for (int i = 1; i < T; i++) {
state_0_wide = state_0_wide + S::mul_wide(state[i], sparse_matrix[i]);
}
state[0] = S::reduce(state_0_wide);
UNROLL
for (int i = 1; i < T; i++) {
state[i] = state[i] + (element * sparse_matrix[T + i - 1]);
}
return element;
}
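The partial round applies the S-box and a round constant to state[0] only, then a sparse matrix of 2T-1 entries: the first T entries produce the new state[0], and the remaining T-1 scale the S-boxed element into the other slots. A CPU sketch with toy modular arithmetic in place of S::mul_wide / S::reduce:

#include <cstdint>
#include <vector>

static const uint64_t Q = 2305843009213693951ULL; // toy modulus, illustration only

static uint64_t mulq(uint64_t a, uint64_t b) { return (uint64_t)((__uint128_t)a * b % Q); }
static uint64_t pow5(uint64_t x) { uint64_t x2 = mulq(x, x), x4 = mulq(x2, x2); return mulq(x4, x); }

// One partial round over a state of width T, sparse_matrix holding 2T-1 entries
// laid out exactly as in the removed kernel.
void partial_round_cpu(
  std::vector<uint64_t>& state, uint64_t round_constant, const std::vector<uint64_t>& sparse_matrix)
{
  const size_t T = state.size();
  uint64_t element = (pow5(state[0]) + round_constant) % Q;

  uint64_t new_state0 = mulq(element, sparse_matrix[0]);
  for (size_t i = 1; i < T; i++)
    new_state0 = (new_state0 + mulq(state[i], sparse_matrix[i])) % Q;

  for (size_t i = 1; i < T; i++)
    state[i] = (state[i] + mulq(element, sparse_matrix[T + i - 1])) % Q;

  state[0] = new_state0;
}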
template <typename S, int T>
__global__ void
void
partial_rounds(S* states, size_t number_of_states, size_t rc_offset, const PoseidonConstants<S> constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
S state[T];
UNROLL
for (int i = 0; i < T; i++) {
state[i] = states[idx * T + i];
}
for (int i = 0; i < constants.partial_rounds; i++) {
partial_round<S, T>(state, rc_offset, i, constants);
rc_offset++;
}
UNROLL
for (int i = 0; i < T; i++) {
states[idx * T + i] = state[i];
}
return;
}
// This function just copies the hash results from the states to the output
template <typename S, int T>
__global__ void get_hash_results(S* states, size_t number_of_states, S* out)
void get_hash_results(S* states, size_t number_of_states, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
out[idx] = states[idx * T + 1];
return;
}
template <typename S, int T>
__global__ void copy_recursive(S* state, size_t number_of_states, S* out)
void copy_recursive(S* state, size_t number_of_states, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
state[(idx / (T - 1) * T) + (idx % (T - 1)) + 1] = out[idx];
return;
}
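copy_recursive feeds a layer of hash outputs back in as inputs to the next tree layer: every T-1 outputs fill elements 1..T-1 of one new state, leaving slot 0 free for the domain tag. A CPU version of the same index arithmetic:

#include <cstddef>
#include <cstdint>

// For T = 3, outputs [o0, o1, o2, o3] are scattered as [_, o0, o1, _, o2, o3],
// the '_' slots being filled by the state-preparation step afterwards.
void copy_recursive_cpu(uint64_t* state, size_t number_of_outputs, const uint64_t* out, int T)
{
  for (size_t idx = 0; idx < number_of_outputs; idx++)
    state[(idx / (T - 1)) * T + (idx % (T - 1)) + 1] = out[idx];
}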
} // namespace poseidon

View File

@@ -0,0 +1,47 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/poseidon/poseidon.cuh"
#include "constants.cu"
#include "kernels.cu"
typedef int cudaError_t;
namespace poseidon {
template <typename S, int T>
cudaError_t
permute_many(S* states, size_t number_of_states, const PoseidonConstants<S>& constants, int& stream)
{
return 0;
}
template <typename S, int T>
cudaError_t poseidon_hash(
S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config)
{
return 0;
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
scalar_t* input,
scalar_t* output,
int number_of_states,
int arity,
const PoseidonConstants<scalar_t>& constants,
PoseidonConfig& config)
{
switch (arity) {
case 2:
return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
case 4:
return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
case 8:
return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
case 11:
return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
default:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
}
return 0;
}
} // namespace poseidon

View File

@@ -0,0 +1,27 @@
set(TARGET icicle_vec_ops)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(VEC_OPS_SOURCE ${SRC}/vec_ops/extern.cpp)
set(VEC_OPS_EXT_SOURCE ${SRC}/vec_ops/extern_extension.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${VEC_OPS_SOURCE} ${VEC_OPS_EXT_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,63 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/utils/utils.h"
#include "vec_ops.cu"
namespace vec_ops {
/**
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return mul<scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Add](@ref Add) function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return add<scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return sub<scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of transpose_batch function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, transpose_matrix_cuda)(
const scalar_t* input,
uint32_t row_size,
uint32_t column_size,
scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops
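These wrappers exist to expose a stable C-linkage symbol per field; CONCAT_EXPAND pastes the field prefix onto the function name at preprocessing time. A self-contained sketch of that token-pasting pattern under assumed macro behaviour (the real macro is presumably defined in the utils/utils.h included above), with a plain multiply so it compiles without the library:

#include <cstdint>

// Two-level paste so the field name is expanded before concatenation,
// producing a symbol such as `toyfield_mul` here.
#define CONCAT_DIRECT_DEMO(a, b) a##_##b
#define CONCAT_EXPAND_DEMO(a, b) CONCAT_DIRECT_DEMO(a, b)
#define FIELD_DEMO toyfield

extern "C" int CONCAT_EXPAND_DEMO(FIELD_DEMO, mul)(const uint64_t* a, const uint64_t* b, int n, uint64_t* out)
{
  for (int i = 0; i < n; i++) out[i] = a[i] * b[i]; // element-wise, no field reduction
  return 0; // 0 standing in for cudaSuccess
}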

View File

@@ -0,0 +1,60 @@
#define FIELD_ID BABY_BEAR
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/utils/utils.h"
#include "vec_ops.cu"
namespace vec_ops {
/**
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable
* during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_mul_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return mul<extension_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Add](@ref Add) function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_add_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return add<extension_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_sub_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return sub<extension_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of transpose_batch function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_transpose_matrix_cuda)(
const extension_t* input,
uint32_t row_size,
uint32_t column_size,
extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<extension_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops

View File

@@ -1,9 +1,8 @@
#include <cuda.h>
#include <stdexcept>
#include "vec_ops/vec_ops.cuh"
#include "gpu-utils/device_context.cuh"
#include "utils/mont.cuh"
#include "../../include/vec_ops/vec_ops.cuh"
#include "../../include/gpu-utils/device_context.cuh"
// #include "utils/mont.cuh"
namespace vec_ops {
@@ -12,97 +11,47 @@ namespace vec_ops {
#define MAX_THREADS_PER_BLOCK 256
template <typename E>
__global__ void mul_kernel(const E* scalar_vec, const E* element_vec, int n, E* result)
void mul_kernel(const E* scalar_vec, const E* element_vec, int n, E* result)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
return;
}
template <typename E, typename S>
__global__ void mul_scalar_kernel(const E* element_vec, const S scalar, int n, E* result)
void mul_scalar_kernel(const E* element_vec, const S scalar, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec[tid] * (scalar); }
return;
}
template <typename E>
__global__ void div_element_wise_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void div_element_wise_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
// TODO: implement a better version based on https://eprint.iacr.org/2008/199
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] * E::inverse(element_vec2[tid]); }
return;
}
template <typename E>
__global__ void add_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void add_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; }
return;
}
template <typename E>
__global__ void sub_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void sub_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
return;
}
template <typename E>
__global__ void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
return;
}
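A plain CPU transpose with the same index arithmetic as the removed kernel, assuming the input is stored with row_size elements per row:

#include <cstdint>
#include <vector>

// out[(i % row_size) * column_size + (i / row_size)] = in[i]: element (r, c) of
// the input lands at position (c, r) of the output, i.e. an out-of-place transpose.
template <typename E>
void transpose_cpu(const std::vector<E>& in, std::vector<E>& out, uint32_t row_size, uint32_t column_size)
{
  for (uint32_t i = 0; i < row_size * column_size; i++)
    out[(i % row_size) * column_size + (i / row_size)] = in[i];
}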
} // namespace
typedef int cudaError_t;
template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
{
CHK_INIT_IF_RETURN();
// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;
E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
const E *d_vec_a, *d_vec_b;
if (!config.is_a_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_a = d_alloc_vec_a;
} else {
d_vec_a = vec_a;
}
if (!config.is_b_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_b = d_alloc_vec_b;
} else {
d_vec_b = vec_b;
}
if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
} else {
d_result = result;
}
// Call the kernel to perform element-wise operation
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
}
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));
return CHK_LAST();
return 0;
}
template <typename E>
@@ -133,35 +82,6 @@ namespace vec_ops {
bool on_device,
bool is_async)
{
int number_of_threads = MAX_THREADS_PER_BLOCK;
int number_of_blocks = (row_size * column_size + number_of_threads - 1) / number_of_threads;
cudaStream_t stream = ctx.stream;
const E* d_mat_in;
E* d_allocated_input = nullptr;
E* d_mat_out;
if (!on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, row_size * column_size * sizeof(E), ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_allocated_input, mat_in, row_size * column_size * sizeof(E), cudaMemcpyHostToDevice, ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&d_mat_out, row_size * column_size * sizeof(E), ctx.stream));
d_mat_in = d_allocated_input;
} else {
d_mat_in = mat_in;
d_mat_out = mat_out;
}
transpose_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(d_mat_in, d_mat_out, row_size, column_size);
if (!on_device) {
CHK_IF_RETURN(
cudaMemcpyAsync(mat_out, d_mat_out, row_size * column_size * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_mat_out, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, ctx.stream));
}
if (!is_async) return CHK_STICKY(cudaStreamSynchronize(ctx.stream));
return CHK_LAST();
return 0;
}
} // namespace vec_ops

View File

@@ -514,20 +514,20 @@ namespace msm {
class dummy_g2_t : public scalar_t
{
public:
static constexpr __host__ __device__ dummy_g2_t to_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ dummy_g2_t to_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ __device__ dummy_g2_t from_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ dummy_g2_t from_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ __device__ dummy_g2_t generator() { return dummy_g2_t{scalar_t::one()}; }
static constexpr __host__ dummy_g2_t generator() { return dummy_g2_t{scalar_t::one()}; }
static __host__ __device__ dummy_g2_t zero() { return dummy_g2_t{scalar_t::zero()}; }
static __host__ dummy_g2_t zero() { return dummy_g2_t{scalar_t::zero()}; }
friend __host__ __device__ dummy_g2_t operator*(const scalar_t& xs, const dummy_g2_t& ys)
friend __host__ dummy_g2_t operator*(const scalar_t& xs, const dummy_g2_t& ys)
{
return dummy_g2_t{scalar_t::reduce(scalar_t::mul_wide(xs, ys))};
}
friend __host__ __device__ dummy_g2_t operator+(const dummy_g2_t& xs, const dummy_g2_t& ys)
friend __host__ dummy_g2_t operator+(const dummy_g2_t& xs, const dummy_g2_t& ys)
{
scalar_t rs = {};
scalar_t::add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);