refactor: generate curve-specific function names with macro instead of using objcopy to modify the symbols

2026-01-09 13:07:59 -05:00 · 2024-01-08 17:19:08 +02:00
parent b184befc99
commit 5fda751e93
14 changed files with 183 additions and 168 deletions
--- a/examples/c++/Poseidon-hash/example.cu
+++ b/examples/c++/Poseidon-hash/example.cu
@@ -7,37 +7,44 @@
 // expose Poseidon classes
 #include "curves/bls12_381/poseidon.cu"

-
-
 // location of a tree node in the array for a given level and offset
-inline uint32_t tree_index(uint32_t level, uint32_t offset) {
-    return (1 << level) - 1 + offset;
-}
+inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }

 // We assume the tree has leaves already set, compute all other levels
-void build_tree(const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
-  for (uint32_t level = tree_height-1; level>0 ; level-- ) {
-    const uint32_t next_level = level -1;
+void build_tree(
+  const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t>& poseidon, cudaStream_t stream)
+{
+  for (uint32_t level = tree_height - 1; level > 0; level--) {
+    const uint32_t next_level = level - 1;
    const uint32_t next_level_width = 1 << next_level;
-    poseidon.hash_blocks(&tree[tree_index(level,0)], next_level_width, &tree[tree_index(next_level,0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
-  }  
+    poseidon.hash_blocks(
+      &tree[tree_index(level, 0)], next_level_width, &tree[tree_index(next_level, 0)],
+      Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
+  }
 }

 // search leaves for a given hash, return offset
-uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height) {
-  const uint32_t tree_width = (1 << (tree_height-1));
-  for (uint32_t i=0; i<tree_width; i++) {
+uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height)
+{
+  const uint32_t tree_width = (1 << (tree_height - 1));
+  for (uint32_t i = 0; i < tree_width; i++) {
    const BLS12_381::scalar_t leaf = tree[tree_index(tree_height - 1, i)];
-    if (leaf == query ) {
+    if (leaf == query) {
      return i; // found the hash
-    }    
+    }
  }
-  return tree_height; // hash not found 
+  return tree_height; // hash not found
 }

-void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, BLS12_381::scalar_t* proof_hash ) {
+void generate_proof(
+  uint32_t position,
+  BLS12_381::scalar_t* tree,
+  const uint32_t tree_height,
+  uint32_t* proof_lr,
+  BLS12_381::scalar_t* proof_hash)
+{
  uint32_t level_index = position;
-  for(uint32_t level = tree_height - 1; level > 0; level--) {
+  for (uint32_t level = tree_height - 1; level > 0; level--) {
    uint32_t lr;
    uint32_t neighbour_index;
    lr = level_index % 2;
@@ -49,18 +56,25 @@ void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t
      neighbour_index = level_index - 1;
    }
    proof_lr[level] = lr;
-    proof_hash[level] = tree[tree_index(level,neighbour_index)];
+    proof_hash[level] = tree[tree_index(level, neighbour_index)];
    level_index /= 2;
  }
  // the proof must match this:
-  proof_hash[0] = tree[tree_index(0,0)];
+  proof_hash[0] = tree[tree_index(0, 0)];
 }

-uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_height, const uint32_t* proof_lr, const BLS12_381::scalar_t* proof_hash, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
+uint32_t validate_proof(
+  const BLS12_381::scalar_t hash,
+  const uint32_t tree_height,
+  const uint32_t* proof_lr,
+  const BLS12_381::scalar_t* proof_hash,
+  Poseidon<BLS12_381::scalar_t>& poseidon,
+  cudaStream_t stream)
+{
  BLS12_381::scalar_t hashes_in[2], hash_out[1], level_hash;
  level_hash = hash;
-  for(uint32_t level = tree_height - 1; level > 0; level --) {
-    if(proof_lr[level]==0) {
+  for (uint32_t level = tree_height - 1; level > 0; level--) {
+    if (proof_lr[level] == 0) {
      hashes_in[0] = level_hash;
      hashes_in[1] = proof_hash[level];
    } else {
@@ -76,12 +90,11 @@ uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_heig

 int main(int argc, char* argv[])
 {
-
  std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
  const uint32_t tree_height = 21;
  std::cout << "Tree height: " << tree_height << std::endl;
  const uint32_t tree_arity = 2;
-  const uint32_t leaf_level = tree_height-1;
+  const uint32_t leaf_level = tree_height - 1;
  const uint32_t tree_width = 1 << leaf_level;
  std::cout << "Tree width: " << tree_width << std::endl;
  const uint32_t tree_size = (1 << tree_height) - 1;
@@ -93,7 +106,8 @@ int main(int argc, char* argv[])
  std::cout << "Block size (arity): " << data_arity << std::endl;
  std::cout << "Initializing blocks..." << std::endl;
  BLS12_381::scalar_t d = BLS12_381::scalar_t::zero();
-  BLS12_381::scalar_t* data = static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
+  BLS12_381::scalar_t* data =
+    static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
  for (uint32_t i = 0; i < tree_width * data_arity; i++) {
    data[i] = d;
    d = d + BLS12_381::scalar_t::one();
@@ -103,31 +117,33 @@ int main(int argc, char* argv[])
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  Poseidon<BLS12_381::scalar_t> data_poseidon(data_arity, stream);
-  data_poseidon.hash_blocks(data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
+  data_poseidon.hash_blocks(
+    data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);

  std::cout << "3. Building Merkle tree" << std::endl;
  Poseidon<BLS12_381::scalar_t> tree_poseidon(tree_arity, stream);
  build_tree(tree_height, tree, tree_poseidon, stream);

  std::cout << "4. Generate membership proof" << std::endl;
-  uint32_t position = tree_width-1;
+  uint32_t position = tree_width - 1;
  std::cout << "Using the hash for block: " << position << std::endl;
  BLS12_381::scalar_t query = tree[tree_index(leaf_level, position)];
  uint32_t query_position = query_membership(query, tree, tree_height);
  // allocate arrays for the proof
  uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
-  BLS12_381::scalar_t* proof_hash = static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
-  generate_proof(query_position, tree, tree_height, proof_lr, proof_hash );
+  BLS12_381::scalar_t* proof_hash =
+    static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
+  generate_proof(query_position, tree, tree_height, proof_lr, proof_hash);

  std::cout << "5. Validate the hash membership" << std::endl;
  uint32_t validated;
  const BLS12_381::scalar_t hash = tree[tree_index(leaf_level, query_position)];
-  validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);  
+  validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
  std::cout << "Validated: " << validated << std::endl;

  std::cout << "6. Tamper the hash" << std::endl;
  const BLS12_381::scalar_t tampered_hash = hash + BLS12_381::scalar_t::one();
-  validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);  
+  validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
  std::cout << "7. Invalidate tamper hash membership" << std::endl;
  std::cout << "Validated: " << validated << std::endl;
  return 0;
--- a/examples/c++/msm/example.cu
+++ b/examples/c++/msm/example.cu
@@ -2,7 +2,6 @@
 #include <fstream>
 #include <iostream>

-
 // include MSM template
 #include "appUtils/msm/msm.cu"
 // select the curve
@@ -42,7 +41,8 @@ int main(int argc, char* argv[])
  cudaStream_t stream1;
  cudaStreamCreate(&stream1);
  auto begin = std::chrono::high_resolution_clock::now();
-  large_msm<scalar_t, projective_t, affine_t>(scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
+  large_msm<scalar_t, projective_t, affine_t>(
+    scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
  auto end = std::chrono::high_resolution_clock::now();
  auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  printf("On-device runtime: %.3f seconds.\n", elapsed.count() * 1e-9);
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -1,5 +1,5 @@
 #include <iostream>
-#include <iomanip> 
+#include <iomanip>
 #include <chrono>
 #include <cuda_runtime.h>
 #include <nvml.h>
@@ -15,51 +15,48 @@ const std::string curve = "BN254";
 #define MAX_THREADS_PER_BLOCK 256

 template <typename T>
-__global__ void vectorMult(T *vec_a, T *vec_b, T *vec_r, size_t n_elments)
+__global__ void vectorMult(T* vec_a, T* vec_b, T* vec_r, size_t n_elments)
 {
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    if (tid < n_elments)
-    {
-        vec_r[tid] = vec_a[tid] * vec_b[tid];
-    }
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_elments) { vec_r[tid] = vec_a[tid] * vec_b[tid]; }
 }

 template <typename T>
-int vector_mult(T *vec_b, T *vec_a, T *vec_result, size_t n_elments)
+int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments)
 {
-    // Set the grid and block dimensions
-    int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
-    int threads_per_block = MAX_THREADS_PER_BLOCK;
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;

-    // Call the kernel to perform element-wise modular multiplication
-    vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
-    return 0;
+  // Call the kernel to perform element-wise modular multiplication
+  vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
+  return 0;
 }

 int main(int argc, char** argv)
 {
  const unsigned vector_size = 1 << 20;
-  const unsigned repetitions = 1 << 20; 
+  const unsigned repetitions = 1 << 20;

  cudaError_t err;
  nvmlInit();
  nvmlDevice_t device;
-  nvmlDeviceGetHandleByIndex(0, &device);  // for GPU 0
+  nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
  std::cout << "Icicle-Examples: vector multiplications" << std::endl;
  char name[NVML_DEVICE_NAME_BUFFER_SIZE];
  if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
-        std::cout << "GPU Model: " << name << std::endl;
-    } else {
-        std::cerr << "Failed to get GPU model name." << std::endl;
-    }
+    std::cout << "GPU Model: " << name << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU model name." << std::endl;
+  }

  unsigned power_limit;
  nvmlDeviceGetPowerManagementLimit(device, &power_limit);
-  
+
  std::cout << "Vector size: " << vector_size << std::endl;
  std::cout << "Repetitions: " << repetitions << std::endl;
  std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
-  
+
  unsigned int baseline_power;
  nvmlDeviceGetPowerUsage(device, &baseline_power);
  std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
@@ -75,8 +72,7 @@ int main(int argc, char** argv)
  T* host_in2 = (T*)malloc(vector_size * sizeof(T));
  std::cout << "Initializing vectors with random data" << std::endl;
  for (int i = 0; i < vector_size; i++) {
-    if ( (i>0) && i % (1<<20) == 0) 
-      std::cout << "Elements: " << i << std::endl;
+    if ((i > 0) && i % (1 << 20) == 0) std::cout << "Elements: " << i << std::endl;
    host_in1[i] = T::rand_host();
    host_in2[i] = T::rand_host();
  }
@@ -89,40 +85,39 @@ int main(int argc, char** argv)
  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));

  if (err != cudaSuccess) {
-      std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
-      return 0;
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 0;
  }

  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));

  if (err != cudaSuccess) {
-      std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
-      return 0;
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 0;
  }

  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));

  if (err != cudaSuccess) {
-      std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
-      return 0;
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 0;
  }

  // copy from host to device
  err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);

  if (err != cudaSuccess) {
-      std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
-      return 0;
+    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
+    return 0;
  }

  err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);

  if (err != cudaSuccess) {
-      std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
-      return 0;
+    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
+    return 0;
  }

-  
  std::cout << "Starting warm-up" << std::endl;
  // Warm-up loop
  for (int i = 0; i < repetitions; i++) {
@@ -138,7 +133,8 @@ int main(int argc, char** argv)
  unsigned power_before;
  nvmlDeviceGetPowerUsage(device, &power_before);
  std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
-  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_before / power_limit << " %" << std::endl;
+  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
+            << " %" << std::endl;
  unsigned temperature_before;
  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
    std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
@@ -156,14 +152,15 @@ int main(int argc, char** argv)
    //   return 0;
    // }
  }
-  
+
  auto end_time = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
  unsigned power_after;
  nvmlDeviceGetPowerUsage(device, &power_after);
  std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
-  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_after / power_limit << " %" << std::endl;
+  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
+            << " %" << std::endl;
  unsigned temperature_after;
  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
    std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
@@ -172,19 +169,19 @@ int main(int argc, char** argv)
  }

  // Report performance in GMPS: Giga Multiplications Per Second
-  double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()) ;
+  double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
  std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;

  // Optional: validate multiplication
-  T * host_out = (T*)malloc(vector_size * sizeof(T));
+  T* host_out = (T*)malloc(vector_size * sizeof(T));

  cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);

  // validate multiplication here...

-  free(host_in1); 
+  free(host_in1);
  free(host_in2);
-  free(host_out); 
+  free(host_out);
  cudaFree(device_in1);
  cudaFree(device_in2);
  cudaFree(device_out);
@@ -192,5 +189,4 @@ int main(int argc, char** argv)
  nvmlShutdown();

  return 0;
-  
 }
--- a/examples/c++/ntt/example.cu
+++ b/examples/c++/ntt/example.cu
@@ -1,7 +1,6 @@
 #include <chrono>
 #include <iostream>

-
 // include NTT template
 #include "icicle/appUtils/ntt/ntt.cuh"

@@ -13,29 +12,32 @@ using namespace BLS12_381;
 typedef scalar_t S;
 typedef scalar_t E;

-scalar_t smult(const unsigned n, scalar_t s) {
+scalar_t smult(const unsigned n, scalar_t s)
+{
  scalar_t r = scalar_t::zero();
-  for (unsigned i=0; i < n; i++) {
+  for (unsigned i = 0; i < n; i++) {
    r = r + s;
  }
  return r;
 }

-void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
+void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
+{
  // Harmonics 0
-  for (unsigned i = 0; i < ntt_size; i=i+1) {
+  for (unsigned i = 0; i < ntt_size; i = i + 1) {
    elements[i] = scalar_t::one();
  }
  // Harmonics 1
-  for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
+  for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
    elements[i] = scalar_t::one();
-    elements[i+1] = scalar_t::neg(scalar_t::one());
+    elements[i + 1] = scalar_t::neg(scalar_t::one());
  }
 }

-int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
+int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
+{
  int nof_errors = 0;
-  E amplitude = smult(ntt_size,scalar_t::one());
+  E amplitude = smult(ntt_size, scalar_t::one());
  // std::cout << "Amplitude: " << amplitude << std::endl;
  // Harmonics 0
  if (elements[0] != amplitude) {
@@ -45,14 +47,14 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elemen
    std::cout << "Validated harmonics 0" << std::endl;
  }
  // Harmonics 1
-  if (elements[ntt_size+1] != amplitude) {
+  if (elements[ntt_size + 1] != amplitude) {
    ++nof_errors;
-    std::cout << "Error in harmonics 1: " << elements[ntt_size+1] << std::endl;
+    std::cout << "Error in harmonics 1: " << elements[ntt_size + 1] << std::endl;
  } else {
    std::cout << "Validated harmonics 1" << std::endl;
  }
  // for (unsigned i = 0; i < nof_ntts * ntt_size; i++) {
-  //   std::cout << elements[i] << std::endl;   
+  //   std::cout << elements[i] << std::endl;
  // }
  return nof_errors;
 }
@@ -68,35 +70,35 @@ int main(int argc, char* argv[])
  const unsigned nof_ntts = 2;
  std::cout << "Number of NTTs: " << nof_ntts << std::endl;
  const unsigned batch_size = nof_ntts * ntt_size;
-  
+
  std::cout << "Generating input data for harmonics 0,1" << std::endl;
  E* elements;
-  elements = (scalar_t*) malloc(sizeof(E) * batch_size);
-  initialize_input(ntt_size, nof_ntts, elements );
-  
+  elements = (scalar_t*)malloc(sizeof(E) * batch_size);
+  initialize_input(ntt_size, nof_ntts, elements);
+
  std::cout << "Running easy-to-use NTT" << std::endl;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
-  
+
  bool inverse = false;
  auto begin0 = std::chrono::high_resolution_clock::now();
  ntt_end2end_batch_template<scalar_t, scalar_t>(elements, batch_size, ntt_size, inverse, stream);
  auto end0 = std::chrono::high_resolution_clock::now();
  auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
  printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
-  validate_output(ntt_size, nof_ntts, elements );
+  validate_output(ntt_size, nof_ntts, elements);
  cudaStreamSynchronize(stream);

  std::cout << "Running not that easy-to-use but fast NTT" << std::endl;
-  
+
  uint32_t n_twiddles = ntt_size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order.
  // represent transform matrix using twiddle factors
-  scalar_t * d_twiddles;
+  scalar_t* d_twiddles;
  d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega(log_ntt_size), stream); // Sscalar
-  scalar_t* d_elements; // Element
-  
+  scalar_t* d_elements;                                                                       // Element
+
  cudaMallocAsync(&d_elements, sizeof(scalar_t) * batch_size, stream);
-  initialize_input(ntt_size, nof_ntts, elements );
+  initialize_input(ntt_size, nof_ntts, elements);
  cudaMemcpyAsync(d_elements, elements, sizeof(scalar_t) * batch_size, cudaMemcpyHostToDevice, stream);
  S* _null = nullptr;
  auto begin1 = std::chrono::high_resolution_clock::now();
@@ -106,12 +108,12 @@ int main(int argc, char* argv[])
  auto end1 = std::chrono::high_resolution_clock::now();
  auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
  printf("Runtime: %.3e seconds\n", elapsed1.count() * 1e-9);
-  
+
  cudaMemcpyAsync(elements, d_elements, sizeof(E) * batch_size, cudaMemcpyDeviceToHost, stream);
-  validate_output(ntt_size, nof_ntts, elements );
+  validate_output(ntt_size, nof_ntts, elements);
  cudaFreeAsync(d_elements, stream);
  cudaFreeAsync(d_twiddles, stream);
-  
+
  cudaStreamDestroy(stream);

  free(elements);
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -64,6 +64,7 @@ project(icicle LANGUAGES CUDA CXX)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+include_directories("${CMAKE_SOURCE_DIR}")


 # when adding a new curve/field, append its name to the end of this list
@@ -98,35 +99,8 @@ if (NOT BUILD_TESTS)
    appUtils/msm/msm.cu
    appUtils/ntt/ntt.cu
  )
-  #set_target_properties(icicle PROPERTIES CUDA_SEPARABLE_COMPILATION ON) #TODO: enable asap
-  #set_target_properties(icicle PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-  target_compile_options(icicle PRIVATE -c)
-
-  add_custom_command(
-    TARGET icicle
-    POST_BUILD
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym MSMCuda=${CURVE}MSMCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym DefaultMSMConfig=${CURVE}DefaultMSMConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym NTTCuda=${CURVE}NTTCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GetDefaultNTTConfig=${CURVE}DefaultNTTConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym InitializeDomain=${CURVE}InitializeDomain ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateScalars=${CURVE}GenerateScalars ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym Eq=${CURVE}Eq ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ToAffine=${CURVE}ToAffine ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateProjectivePoints=${CURVE}GenerateProjectivePoints ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateAffinePoints=${CURVE}GenerateAffinePoints ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ScalarConvertMontgomery=${CURVE}ScalarConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym AffineConvertMontgomery=${CURVE}AffineConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
-    COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ProjectiveConvertMontgomery=${CURVE}ProjectiveConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
-    COMMAND ${CMAKE_AR} ARGS -rcs ${PROJECT_BINARY_DIR}/libingo_${CURVE}.a 
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/vec_ops.cu.o
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o 
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
-    ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
-                                                                                 
-  )
+  set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
+  target_compile_definitions(icicle PRIVATE CURVE=${CURVE})  

 else()

--- a/icicle/appUtils/msm/Makefile
+++ b/icicle/appUtils/msm/Makefile
@@ -1,4 +1,4 @@
 test_msm:
 	mkdir -p work
-	nvcc -o work/test_msm -I. tests/msm_test.cu
+	nvcc -o work/test_msm -I. -I../.. tests/msm_test.cu
 	work/test_msm
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
@@ -10,13 +10,14 @@
 #include <stdexcept>
 #include <vector>

-#include "../../curves/curve_config.cuh"
-#include "../../primitives/affine.cuh"
-#include "../../primitives/field.cuh"
-#include "../../primitives/projective.cuh"
-#include "../../utils/cuda_utils.cuh"
-#include "../../utils/error_handler.cuh"
-#include "../../utils/mont.cuh"
+#include "curves/curve_config.cuh"
+#include "primitives/affine.cuh"
+#include "primitives/field.cuh"
+#include "primitives/projective.cuh"
+#include "utils/cuda_utils.cuh"
+#include "utils/error_handler.cuh"
+#include "utils/mont.cuh"
+#include "utils/utils.h"

 namespace msm {

@@ -966,7 +967,7 @@ namespace msm {

  } // namespace

-  extern "C" MSMConfig DefaultMSMConfig()
+  extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)()
  {
    device_context::DeviceContext ctx = device_context::get_default_device_context();
    MSMConfig config = {
@@ -1016,7 +1017,7 @@ namespace msm {
   *  - `P` is the [projective representation](@ref projective_t) of curve points.
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
-  extern "C" cudaError_t MSMCuda(
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, MSMCuda)(
    curve_config::scalar_t* scalars,
    curve_config::affine_t* points,
    int msm_size,
--- a/icicle/appUtils/ntt/ntt.cu
+++ b/icicle/appUtils/ntt/ntt.cu
@@ -3,9 +3,10 @@
 #include <unordered_map>
 #include <vector>

-#include "../../curves/curve_config.cuh"
-#include "../../utils/sharedmem.cuh"
-#include "../../utils/utils_kernels.cuh"
+#include "curves/curve_config.cuh"
+#include "utils/sharedmem.cuh"
+#include "utils/utils_kernels.cuh"
+#include "utils/utils.h"

 namespace ntt {

@@ -508,7 +509,7 @@ namespace ntt {
   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
   * @return Default [NTTConfig](@ref NTTConfig).
   */
-  extern "C" NTTConfig<curve_config::scalar_t> GetDefaultNTTConfig()
+  extern "C" NTTConfig<curve_config::scalar_t> CONCAT_EXPAND(CURVE, GetDefaultNTTConfig)()
  {
    return DefaultNTTConfig<curve_config::scalar_t>();
  }
@@ -518,7 +519,8 @@ namespace ntt {
   * value of template parameter (where the curve is given by `-DCURVE` env variable during build):
   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
   */
-  extern "C" cudaError_t InitializeDomain(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
+  extern "C" cudaError_t
+  CONCAT_EXPAND(CURVE, InitializeDomain)(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
  {
    return InitDomain(primitive_root, ctx);
  }
@@ -529,7 +531,7 @@ namespace ntt {
   *  - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
-  extern "C" cudaError_t NTTCuda(
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
    curve_config::scalar_t* input,
    int size,
    NTTDir dir,
--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -4,11 +4,12 @@

 #include <cuda_runtime.h>

-#include "../../curves/curve_config.cuh"
-#include "../../utils/device_context.cuh"
-#include "../../utils/error_handler.cuh"
-#include "../../utils/sharedmem.cuh"
-#include "../../utils/utils_kernels.cuh"
+#include "curves/curve_config.cuh"
+#include "utils/device_context.cuh"
+#include "utils/error_handler.cuh"
+#include "utils/sharedmem.cuh"
+#include "utils/utils_kernels.cuh"
+#include "utils/utils.h"

 /**
 * @namespace ntt
@@ -86,7 +87,7 @@ namespace ntt {
   * @return Default value of [NTTConfig](@ref NTTConfig).
   */
  template <typename S>
-  NTTConfig<S> DefaultNTTConfig();
+  NTTConfig<S> CONCAT_EXPAND(CURVE, DefaultNTTConfig)();

  /**
   * A function that computes NTT or iNTT in-place.
--- a/icicle/primitives/field.cu
+++ b/icicle/primitives/field.cu
@@ -1,6 +1,10 @@
-#include "../curves/curve_config.cuh"
+#include "curves/curve_config.cuh"
 #include "field.cuh"
+#include "utils/utils.h"

 #define scalar_t curve_config::scalar_t

-extern "C" void GenerateScalars(scalar_t* scalars, int size) { scalar_t::RandHostMany(scalars, size); }
+extern "C" void CONCAT_EXPAND(CURVE, GenerateScalars)(scalar_t* scalars, int size)
+{
+  scalar_t::RandHostMany(scalars, size);
+}
--- a/icicle/primitives/projective.cu
+++ b/icicle/primitives/projective.cu
@@ -1,12 +1,13 @@
-#include "../curves/curve_config.cuh"
+#include "curves/curve_config.cuh"
 #include "projective.cuh"
 #include <cuda.h>
+#include "utils/utils.h"

 #define projective_t  curve_config::projective_t // TODO: global to avoid lengthy texts
 #define affine_t      curve_config::affine_t
 #define point_field_t curve_config::point_field_t

-extern "C" bool Eq(projective_t* point1, projective_t* point2)
+extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
 {
  return (*point1 == *point2) &&
         !((point1->x == point_field_t::zero()) && (point1->y == point_field_t::zero()) &&
@@ -15,11 +16,20 @@ extern "C" bool Eq(projective_t* point1, projective_t* point2)
           (point2->z == point_field_t::zero()));
 }

-extern "C" void ToAffine(projective_t* point, affine_t* point_out) { *point_out = projective_t::to_affine(*point); }
+extern "C" void CONCAT_EXPAND(CURVE, ToAffine)(projective_t* point, affine_t* point_out)
+{
+  *point_out = projective_t::to_affine(*point);
+}

-extern "C" void GenerateProjectivePoints(projective_t* points, int size) { projective_t::RandHostMany(points, size); }
+extern "C" void CONCAT_EXPAND(CURVE, GenerateProjectivePoints)(projective_t* points, int size)
+{
+  projective_t::RandHostMany(points, size);
+}

-extern "C" void GenerateAffinePoints(affine_t* points, int size) { projective_t::RandHostManyAffine(points, size); }
+extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int size)
+{
+  projective_t::RandHostManyAffine(points, size);
+}

 #if defined(G2_DEFINED)

--- a/icicle/utils/mont.cu
+++ b/icicle/utils/mont.cu
@@ -1,10 +1,11 @@
-#include "../curves/curve_config.cuh"
+#include "curves/curve_config.cuh"
 #include "device_context.cuh"
 #include "mont.cuh"
+#include "utils/utils.h"

 namespace mont {
-  extern "C" cudaError_t
-  ScalarConvertMontgomery(curve_config::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, ScalarConvertMontgomery)(
+    curve_config::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
  {
    if (is_into) {
      return ToMontgomery(d_inout, n, ctx.stream, d_inout);
@@ -13,8 +14,8 @@ namespace mont {
    }
  }

-  extern "C" cudaError_t
-  AffineConvertMontgomery(curve_config::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, AffineConvertMontgomery)(
+    curve_config::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
  {
    if (is_into) {
      return ToMontgomery(d_inout, n, ctx.stream, d_inout);
@@ -23,7 +24,7 @@ namespace mont {
    }
  }

-  extern "C" cudaError_t ProjectiveConvertMontgomery(
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, ProjectiveConvertMontgomery)(
    curve_config::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
  {
    if (is_into) {
--- a/icicle/utils/utils.h
+++ b/icicle/utils/utils.h
@@ -0,0 +1,8 @@
+#pragma once
+#ifndef ICICLE_UTILS_H
+#define ICICLE_UTILS_H
+
+#define CONCAT_DIRECT(a, b) a##b
+#define CONCAT_EXPAND(a, b) CONCAT_DIRECT(a, b) // expand a,b before concatenation
+
+#endif // ICICLE_UTILS_H
--- a/wrappers/rust/icicle-core/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-core/src/ntt/mod.rs
@@ -149,7 +149,7 @@ macro_rules! impl_ntt {
                output: *mut $field,
            ) -> CudaError;

-            #[link_name = concat!($field_prefix, "DefaultNTTConfig")]
+            #[link_name = concat!($field_prefix, "GetDefaultNTTConfig")]
            fn default_ntt_config() -> NTTConfig<'static, $field>;

            #[link_name = concat!($field_prefix, "InitializeDomain")]