Compare commits

..

17 Commits

Author SHA1 Message Date
Ethan-000
621676bd41 feat: add num_bits() function (#570)
## Describe the changes

Adding a `num_bits()` function, similar to
dcf73a5f96/ff/src/biginteger/mod.rs (L482)

This could be useful for small-field optimizations.
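For intuition, here is a host-side C++ sketch (illustrative only, not the ICICLE API; the function name and the little-endian 32-bit limb layout are assumptions) of what such a helper computes: the bit length of a multi-limb integer, ignoring leading zero limbs and bits.

```c++
#include <cstdint>
#include <cstdio>

// Bit length of a little-endian multi-limb integer, skipping leading zero limbs.
// Hypothetical helper for illustration; the device code uses __clz per limb instead.
unsigned num_bits_host(const uint32_t* limbs, unsigned limb_count)
{
  for (unsigned i = limb_count; i-- > 0;) {
    if (limbs[i] != 0) {
      return i * 32 + (32 - __builtin_clz(limbs[i])); // bits below + bits in the top non-zero limb
    }
  }
  return 0; // all limbs are zero
}

int main()
{
  uint32_t x[4] = {0xdeadbeef, 0x5, 0, 0}; // value fits in 35 bits
  printf("%u\n", num_bits_host(x, 4));     // prints 35
}
```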

## Linked Issues

Resolves #
2024-08-07 09:37:16 +03:00
Otsar
badb8c5d68 Removed ZK containers from docs sidebar (#571)
## Describe the changes

This PR...

## Linked Issues

Resolves #
2024-08-04 18:38:37 +03:00
Otsar
1300434bbe Removed ZK containers from docs sidebar 2024-08-04 11:14:06 +03:00
yshekel
6a67893773 remove the recommendation to use zk-containers in examples (#569) 2024-08-01 14:58:02 +03:00
ChickenLover
0cb0b49be9 Add Sha3 (#560)
## Describe the changes

This PR...

## Linked Issues

Resolves #
2024-07-28 15:31:28 +07:00
Vlad
8411ed1451 Feat/vlad/refactor from affine (#554)
## Describe the changes

This PR refactors the various affine-to-projective conversion functions to use the underlying C function.

It also includes a small bug fix for the ProjectiveToAffine() function in Go.

## Linked Issues

Resolves #
2024-07-22 10:37:24 +02:00
Vlad
877018c84c more go fmt 2024-07-15 16:55:40 +02:00
Vlad
91ac666e06 Merge branch 'feat/vlad/refactor-from-affine' of github.com:ingonyama-zk/icicle into feat/vlad/refactor-from-affine 2024-07-15 16:48:25 +02:00
Vlad
46e6c20440 go fmt 2024-07-15 16:47:40 +02:00
Vlad
e4eda8938d go fmt 2024-07-05 21:29:44 +02:00
Vlad
fb707d5350 Merge branch 'main' into feat/vlad/refactor-from-affine 2024-07-05 15:40:34 +02:00
Vlad
6336e74d5a refactor from_affine with C link 2024-07-04 11:03:58 +02:00
Vlad
279cdc66e0 generated go files 2024-07-03 10:41:32 +02:00
Vlad
81644fc28c use zero method of projective in toProjective
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-03 10:37:02 +02:00
Vlad
17732ea013 use zero method of projective in fromAffine
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-03 10:36:14 +02:00
Vlad
9e057c835d fixed to_projective in rust 2024-07-03 09:18:41 +02:00
Vlad
f08b5bb49d fixed fromAffine and toProj in golang 2024-07-03 09:07:43 +02:00
52 changed files with 371 additions and 1219 deletions

View File

@@ -12,6 +12,10 @@ At its core, Keccak consists of a permutation function operating on a state arra
- **Chi:** This step applies a nonlinear mixing operation to each lane of the state array.
- **Iota:** This step introduces a round constant to the state array.
## Keccak vs Sha3
There is some [confusion](https://www.cybertest.com/blog/keccak-vs-sha3) between what is called `Keccak` and what is called `Sha3`; ICICLE supports both. `Keccak256` refers to the original (pre-standardization) hash function used in Ethereum, and `Sha3-256` refers to the NIST-standardized variant; the two differ only in their padding rule.
## Using Keccak
ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree or running multiple hashes in parallel.
@@ -35,7 +39,7 @@ let input_block_len = 136;
let number_of_hashes = 1024;
let preimages = vec![1u8; number_of_hashes * input_block_len];
let mut digests = vec![0u8; number_of_hashes * 64];
let mut digests = vec![0u8; number_of_hashes * 32];
let preimages_slice = HostSlice::from_slice(&preimages);
let digests_slice = HostSlice::from_mut_slice(&mut digests);

View File

@@ -191,11 +191,6 @@ module.exports = {
},
]
},
{
type: "doc",
label: "ZK Containers",
id: "ZKContainers",
},
{
type: "doc",
label: "Ingonyama Grant program",

View File

@@ -1,24 +0,0 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # 'native' requires CMake 3.24+; on earlier versions it is ignored and no architecture is passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,38 +0,0 @@
# Icicle example: Montgomery vector operations (mul, add, sub) for all possible options:
- is_a_on_device
- is_b_on_device
- is_result_on_device
- is_in_montgomery_form
- (is_async isn't checked)
## Key-Takeaway
`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
## Concise Usage Explanation
Define the field to be used, e.g.:
```c++
#include "api/bn254.h"
```
```c++
using namespace bn254;
typedef scalar_t T;
```
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the parameters of the example, such as the vector size
2. Generate random vectors on the host
3. Copy them to the device
4. Execute element-wise vector multiplication on the device
5. Copy the results back to the host

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254
cmake --build build/icicle -j
# Configure and build the example application
cmake -DCMAKE_BUILD_TYPE=Debug -S. -B build/example
cmake --build build/example

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,734 +0,0 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <nvml.h>
#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"
#include <vec_ops/../../include/utils/mont.cuh>
using namespace vec_ops;
using namespace bn254;
typedef scalar_t T;
enum Op { MUL, ADD, SUB, LAST };
// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617
int vector_op(
T* vec_a,
T* vec_b,
T* vec_result,
size_t n_elements,
device_context::DeviceContext ctx,
vec_ops::VecOpsConfig config,
Op op)
{
cudaError_t err;
switch (op) {
case MUL:
err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
break;
case ADD:
err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
break;
case SUB:
err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
break;
}
// cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int vector_mul(
T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int vector_add(
T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int vector_sub(
T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 0;
const unsigned repetitions = 1 << 0;
cudaError_t err;
nvmlInit();
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
std::cout << "Icicle-Examples: vector mul / add / sub operations." << std::endl;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
unsigned power_limit;
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
std::cout << "Vector size: " << vector_size << std::endl;
std::cout << "Repetitions: " << repetitions << std::endl;
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
unsigned int baseline_power;
nvmlDeviceGetPowerUsage(device, &baseline_power);
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
unsigned baseline_temperature;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// host data
std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl;
T* host_in1_init = (T*)malloc(vector_size * sizeof(T));
T* host_in2_init = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with normal presentation random data" << std::endl;
T::rand_host_many(host_in1_init, vector_size);
T::rand_host_many(host_in2_init, vector_size);
std::cout << "Allocate memory for the output vectors" << std::endl;
T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output.
T* host_out_ref_mul = (T*)malloc(
vector_size *
sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content).
T* host_out_ref_add = (T*)malloc(
vector_size *
sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content).
T* host_out_ref_sub = (T*)malloc(
vector_size *
sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content).
std::cout << "Initializing output vectors with random data" << std::endl;
T::rand_host_many(host_out, vector_size);
T::rand_host_many(host_out_ref_mul, vector_size);
T::rand_host_many(host_out_ref_add, vector_size);
T::rand_host_many(host_out_ref_sub, vector_size);
// device data
device_context::DeviceContext ctx = device_context::get_default_device_context();
T* device_in1;
T* device_in2;
T* device_out;
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
//****************************************
// Test warm-up and reference output config. The reference output is used to check whether the test passed.
//****************************************
// copy from host to device
err = cudaMemcpy(device_in1, host_in1_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2, host_in2_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up run" << std::endl;
// Warm-up loop
for (int op = MUL; op != LAST; op++) {
for (int i = 0; i < repetitions; i++) {
// vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config);
vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
}
switch (op) {
case MUL:
err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
break;
case ADD:
err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
break;
case SUB:
err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
break;
}
}
// copy the result from device to host_out_ref_mul to keep it for later comparisons.
// err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
return 0;
}
//****************************************
// End of test warm-up and reference output config.
//****************************************
std::cout << "Starting benchmarking" << std::endl;
unsigned power_before;
nvmlDeviceGetPowerUsage(device, &power_before);
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
<< " %" << std::endl;
unsigned temperature_before;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
//*******************************************************
// Benchmark test:
// Loop for (mul, add, sub):
// Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form):
//*******************************************************
T* host_in1 =
(T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark.
T* host_in2 =
(T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark.
// Test when the result is not in-place
for (int op = MUL; op != LAST; op++) {
// for (int config_idx = 0; config_idx < 0; config_idx++) {
for (int config_idx = 0; config_idx < 16; config_idx++) {
std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
for (int i = 0; i < vector_size; i++) {
host_in1[i] = host_in1_init[i];
host_in2[i] = host_in2_init[i];
}
config.is_a_on_device = (config_idx >> 3) & 0x1;
config.is_b_on_device = (config_idx >> 2) & 0x1;
config.is_result_on_device = (config_idx >> 1) & 0x1;
config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
// Copy from host to device again: device_inX was already overwritten by the warm-up and the data is reused later in the loop.
if (config.is_a_on_device) {
if (config.is_in_montgomery_form) {
err =
cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(
mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
} else { // Normal presentation.
err =
cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to montgomery and copy back to host.
err =
cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
if (config.is_b_on_device) {
if (config.is_in_montgomery_form) {
err =
cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(
mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
} else {
// Normal presentation.
err =
cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to montgomery and copy back to host.
err =
cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
CHK_IF_RETURN(cudaPeekAtLastError());
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}
case 0b000:
vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
break;
case 0b001:
vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
break;
case 0b010:
vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
break;
case 0b011:
vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
break;
case 0b100:
vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
break;
case 0b101:
vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
break;
case 0b110:
vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
break;
case 0b111:
vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
break;
}
CHK_IF_RETURN(cudaPeekAtLastError());
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
switch (op) {
case MUL:
std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx "
<< config_idx << " and result not in-place" << std::endl;
break;
case ADD:
std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx "
<< config_idx << " and result not in-place" << std::endl;
break;
case SUB:
std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx "
<< config_idx << " and result not in-place" << std::endl;
break;
}
if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed.
CHK_IF_RETURN(mont::from_montgomery(
device_out, vector_size, config.ctx.stream,
device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
}
err = cudaMemcpy(
host_out, device_out, vector_size * sizeof(T),
cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // Data is not on device but it is in host_out.
if (config.is_in_montgomery_form) { // host_out should be written to device, converted from montgomery and
// written back to host. Then compared vs. host_out_ref_mul.
err = cudaMemcpy(
device_out, host_out, vector_size * sizeof(T),
cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::from_montgomery(
device_out, vector_size, config.ctx.stream,
device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
err = cudaMemcpy(
host_out, device_out, vector_size * sizeof(T),
cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // host_out could be compared vs. host_out_ref_mul as is.
}
}
//****************************************
// End of benchmark test.
//****************************************
//***********************************************
// Test result check
// Check is performed by executing the operation in a normal presentation
// (located in in host_out_ref_mul) and comparing it with the
// benchmark test result.
//***********************************************
int test_failed = 0;
// std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
// std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
switch (op) {
case MUL:
for (int i = 0; i < vector_size; i++) {
if (host_out_ref_mul[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
<< ", config is printed below:" << std::endl;
test_failed = 1;
}
}
break;
case ADD:
for (int i = 0; i < vector_size; i++) {
if (host_out_ref_add[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
<< ", config is printed below:" << std::endl;
test_failed = 1;
}
}
break;
case SUB:
for (int i = 0; i < vector_size; i++) {
if (host_out_ref_sub[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
<< ", config is printed below:" << std::endl;
test_failed = 1;
}
}
break;
}
if (test_failed) {
// std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" <<
// std::endl;
std::cout << "===>>> result is not in-place: " << std::endl;
std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
exit(2);
}
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1)
<< (float)100.0 * power_after / power_limit << " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in GMPS: Giga Multiplications Per Second
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
}
}
// Test when the result is in-place
for (int op = MUL; op != LAST; op++) {
for (int config_idx = 0; config_idx < 16; config_idx++) {
for (int i = 0; i < vector_size; i++) {
host_in1[i] = host_in1_init[i];
host_in2[i] = host_in2_init[i];
}
config.is_a_on_device = (config_idx >> 4) & 0x1;
config.is_b_on_device = (config_idx >> 3) & 0x1;
config.is_result_on_device = (config_idx >> 2) & 0x1;
config.is_in_montgomery_form = (config_idx >> 1) & 0x1;
if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; }
// Copy from host to device again: device_inX was already overwritten by the warm-up and the data is reused later in the loop.
if (config.is_a_on_device) {
if (config.is_in_montgomery_form) {
err =
cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(
mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
} else { // Normal presentation.
err =
cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to montgomery and copy back to host.
err =
cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
if (config.is_b_on_device) {
if (config.is_in_montgomery_form) {
err =
cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(
mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
} else {
// Normal presentation.
err =
cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to montgomery and copy back to host.
err =
cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
CHK_IF_RETURN(cudaPeekAtLastError());
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device}
case 0b000:
vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
break;
case 0b001:
break;
case 0b010:
vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
break;
case 0b011:
break;
case 0b100:
break;
case 0b101:
vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
break;
case 0b110:
break;
case 0b111:
vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
break;
}
CHK_IF_RETURN(cudaPeekAtLastError());
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
switch (op) {
case MUL:
std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx "
<< config_idx << " and result in-place" << std::endl;
break;
case ADD:
std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx "
<< config_idx << " and result in-place" << std::endl;
break;
case SUB:
std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx "
<< config_idx << " and result in-place" << std::endl;
break;
}
if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed.
CHK_IF_RETURN(mont::from_montgomery(
device_in1, vector_size, config.ctx.stream,
device_in1)); // Convert to normal in order to check vs. host_out_ref_mul.
}
err = cudaMemcpy(
host_out, device_in1, vector_size * sizeof(T),
cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check.
if (config.is_in_montgomery_form) { // host_out should be written to device, converted from montgomery and
// written back to host. Then compared vs. host_out_ref_mul.
err = cudaMemcpy(
device_out, host_in1, vector_size * sizeof(T),
cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::from_montgomery(
device_out, vector_size, config.ctx.stream,
device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
err = cudaMemcpy(
host_out, device_out, vector_size * sizeof(T),
cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // host_out could be compared vs. host_out_ref_mul as is.
err = cudaMemcpy(
device_out, host_in1, vector_size * sizeof(T),
cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(
host_out, device_out, vector_size * sizeof(T),
cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
//****************************************
// End of benchmark test.
//****************************************
//***********************************************
// Test result check
// Check is performed by executing the operation in a normal presentation
// (located in in host_out_ref_mul) and comparing it with the
// benchmark test result.
//***********************************************
int test_failed = 0;
// std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
// std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
switch (op) {
case MUL:
for (int i = 0; i < vector_size; i++) {
if (host_out_ref_mul[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
<< ", config is printed below:" << std::endl;
std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
test_failed = 1;
}
}
break;
case ADD:
for (int i = 0; i < vector_size; i++) {
if (host_out_ref_add[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
<< ", config is printed below:" << std::endl;
std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
test_failed = 1;
}
}
break;
case SUB:
for (int i = 0; i < vector_size; i++) {
if (host_out_ref_sub[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
<< ", config is printed below:" << std::endl;
std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
test_failed = 1;
}
}
break;
}
if (test_failed) {
// std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" <<
// std::endl;
std::cout << "===>>> result is in-place: " << std::endl;
std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
std::cout << "host_out[0] = " << host_out[0] << std::endl;
exit(2);
}
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1)
<< (float)100.0 * power_after / power_limit << " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in GMPS: Giga Multiplications Per Second
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
}
}
// clean up and exit
free(host_in1_init);
free(host_in2_init);
free(host_in1);
free(host_in2);
free(host_out);
free(host_out_ref_mul);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
nvmlShutdown();
return 0;
}

View File

@@ -1,2 +0,0 @@
#! /bin/bash
./build/example/example

View File

@@ -1,9 +1,5 @@
# Icicle example: Multi-Scalar Multiplication (MSM)
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template function `MSM` to accelerate [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).

View File

@@ -1,9 +1,5 @@
# Icicle example: Multiplication
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)

View File

@@ -1,9 +1,5 @@
# Icicle example: Number-Theoretical Transform (NTT)
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template function NTT for [Number Theoretical Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), also known as Discrete Fourier Transform.

View File

@@ -1,9 +1,5 @@
# ICICLE example: Pedersen Commitment
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
A Pedersen Commitment is a cryptographic primitive to commit to a value or a vector of values while keeping it hidden, yet enabling the committer to reveal the value later. It provides both hiding (the commitment does not reveal any information about the value) and binding properties (once a value is committed, it cannot be changed without detection).
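As a quick refresher (a standard construction, not taken from this diff), for independent group generators $G$ and $H$ with no known discrete-log relation and a random blinding factor $r$, the commitment to a value $v$ is

$$C = v \cdot G + r \cdot H$$

The random $r$ provides hiding; binding follows from the committer not knowing the discrete log of $H$ with respect to $G$.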

View File

@@ -1,9 +1,5 @@
# ICICLE examples: computations with polynomials
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
Polynomials are crucial for Zero-Knowledge Proofs (ZKPs): they enable efficient representation and verification of computational statements, facilitate privacy-preserving protocols, and support complex mathematical operations essential for constructing and verifying proofs without revealing underlying data. Polynomial API is documented [here](https://dev.ingonyama.com/icicle/polynomials/overview)

View File

@@ -1,9 +1,5 @@
# Icicle example: build a Merkle tree using Poseidon hash
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template `poseidon_hash` to accelerate the popular [Poseidon hash function](https://www.poseidon-hash.info/).

View File

@@ -2,10 +2,6 @@
`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Best Practices
In order to save time and setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust

View File

@@ -4,10 +4,6 @@
`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
## Best Practices
In order to save time and setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust

View File

@@ -124,6 +124,19 @@ public:
*/
static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
// Count the number of bits of the field element, excluding leading zeros.
static constexpr HOST_DEVICE_INLINE unsigned num_bits(const Field& x)
{
size_t size = sizeof(x.limbs_storage.limbs[0]) * 8;
unsigned ret = size * TLC;
for (unsigned i = TLC; i-- > 0;) {
int leading = __clz(x.limbs_storage.limbs[i]);
ret -= leading;
if (leading != size) { break; }
}
return ret;
}
static constexpr unsigned slack_bits = 32 * TLC - NBITS;
struct Wide {

View File

@@ -22,9 +22,14 @@ namespace keccak {
// Number of state elements in u64
const int KECCAK_STATE_SIZE = 25;
const int KECCAK_PADDING_CONST = 1;
const int SHA3_PADDING_CONST = 6;
class Keccak : public Hasher<uint8_t, uint64_t>
{
public:
const int PADDING_CONST;
cudaError_t run_hash_many_kernel(
const uint8_t* input,
uint64_t* output,
@@ -33,7 +38,34 @@ namespace keccak {
unsigned int output_len,
const device_context::DeviceContext& ctx) const override;
Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(KECCAK_STATE_SIZE, KECCAK_STATE_SIZE, rate, 0) {}
Keccak(unsigned int rate, unsigned int padding_const)
: Hasher<uint8_t, uint64_t>(KECCAK_STATE_SIZE, KECCAK_STATE_SIZE, rate, 0), PADDING_CONST(padding_const)
{
}
};
class Keccak256 : public Keccak
{
public:
Keccak256() : Keccak(KECCAK_256_RATE, KECCAK_PADDING_CONST) {}
};
class Keccak512 : public Keccak
{
public:
Keccak512() : Keccak(KECCAK_512_RATE, KECCAK_PADDING_CONST) {}
};
class Sha3_256 : public Keccak
{
public:
Sha3_256() : Keccak(KECCAK_256_RATE, SHA3_PADDING_CONST) {}
};
class Sha3_512 : public Keccak
{
public:
Sha3_512() : Keccak(KECCAK_512_RATE, SHA3_PADDING_CONST) {}
};
} // namespace keccak
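A minimal C++ usage sketch of the new classes (the include path is an assumption, and obtaining a `HashConfig` is left to the caller; `hash_many` is the `Hasher` entry point also used by the extern "C" wrappers below). The only behavioral difference between `Keccak256` and `Sha3_256` is the padding constant passed to the `Keccak` base constructor.

```c++
#include "hash/keccak/keccak.cuh" // assumed include path

using namespace keccak;

// Hash `number_of_blocks` preimages of `input_block_size` bytes each into 256-bit
// digests, once with legacy Keccak padding and once with SHA-3 (FIPS 202) padding.
cudaError_t hash_both(
  uint8_t* input, int input_block_size, int number_of_blocks,
  uint8_t* keccak_out, uint8_t* sha3_out, HashConfig& config)
{
  cudaError_t err = Keccak256().hash_many(
    input, (uint64_t*)keccak_out, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
  if (err != cudaSuccess) return err;
  return Sha3_256().hash_many(
    input, (uint64_t*)sha3_out, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
}
```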

View File

@@ -27,8 +27,6 @@ namespace vec_ops {
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
* function will block the current CPU thread. */
bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form.
* Default value: false. */
};
/**
@@ -44,7 +42,6 @@ namespace vec_ops {
false, // is_b_on_device
false, // is_result_on_device
false, // is_async
false, // is_in_montgomery_form
};
return config;
}

View File

@@ -20,6 +20,11 @@ extern "C" void CONCAT_EXPAND(CURVE, to_affine)(projective_t* point, affine_t* p
*point_out = projective_t::to_affine(*point);
}
extern "C" void CONCAT_EXPAND(CURVE, from_affine)(affine_t* point, projective_t* point_out)
{
*point_out = projective_t::from_affine(*point);
}
extern "C" void CONCAT_EXPAND(CURVE, generate_projective_points)(projective_t* points, int size)
{
projective_t::rand_host_many(points, size);

View File

@@ -20,6 +20,11 @@ extern "C" void CONCAT_EXPAND(CURVE, g2_to_affine)(g2_projective_t* point, g2_af
*point_out = g2_projective_t::to_affine(*point);
}
extern "C" void CONCAT_EXPAND(CURVE, g2_from_affine)(g2_affine_t* point, g2_projective_t* point_out)
{
*point_out = g2_projective_t::from_affine(*point);
}
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_projective_points)(g2_projective_t* points, int size)
{
g2_projective_t::rand_host_many(points, size);

View File

@@ -11,15 +11,29 @@ namespace keccak {
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
{
return Keccak(KECCAK_256_RATE)
.hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
return Keccak256().hash_many(
input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
}
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
{
return Keccak(KECCAK_512_RATE)
.hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_512_DIGEST, config);
return Keccak512().hash_many(
input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_512_DIGEST, config);
}
extern "C" cudaError_t
sha3_256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
{
return Sha3_256().hash_many(
input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
}
extern "C" cudaError_t
sha3_512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
{
return Sha3_512().hash_many(
input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_512_DIGEST, config);
}
extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
@@ -29,7 +43,7 @@ namespace keccak {
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config)
{
Keccak keccak(KECCAK_256_RATE);
Keccak256 keccak;
return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
leaves, digests, height, input_block_len, keccak, keccak, tree_config);
}
@@ -41,7 +55,31 @@ namespace keccak {
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config)
{
Keccak keccak(KECCAK_512_RATE);
Keccak512 keccak;
return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
leaves, digests, height, input_block_len, keccak, keccak, tree_config);
}
extern "C" cudaError_t build_sha3_256_merkle_tree_cuda(
const uint8_t* leaves,
uint64_t* digests,
unsigned int height,
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config)
{
Sha3_256 keccak;
return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
leaves, digests, height, input_block_len, keccak, keccak, tree_config);
}
extern "C" cudaError_t build_sha3_512_merkle_tree_cuda(
const uint8_t* leaves,
uint64_t* digests,
unsigned int height,
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config)
{
Sha3_512 keccak;
return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
leaves, digests, height, input_block_len, keccak, keccak, tree_config);
}
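A hedged sketch of calling the new C entry point from another translation unit: the forward declaration mirrors the signature added above, but the visibility of `HashConfig` and how to construct one are assumptions not shown in this diff. Note that the output buffer sizing matches the docs fix above: 32 bytes per 256-bit digest.

```c++
#include <cstdint>
#include <cuda_runtime.h>

struct HashConfig; // defined in the ICICLE hash headers; treated as opaque here (assumption)

// Forward declaration matching the extern "C" symbol added in this PR.
extern "C" cudaError_t
sha3_256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config);

// `digests` must hold number_of_blocks * 32 bytes: one 256-bit digest per preimage.
cudaError_t sha3_batch(
  uint8_t* preimages, int input_block_size, int number_of_blocks, uint8_t* digests, HashConfig& config)
{
  return sha3_256_cuda(preimages, input_block_size, number_of_blocks, digests, config);
}
```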

View File

@@ -180,8 +180,13 @@ namespace keccak {
}
template <const int R>
__global__ void
keccak_hash_blocks(const uint8_t* input, int input_block_size, int output_len, int number_of_blocks, uint64_t* output)
__global__ void keccak_hash_blocks(
const uint8_t* input,
int input_block_size,
int output_len,
int number_of_blocks,
uint64_t* output,
int padding_const)
{
int sid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (sid >= number_of_blocks) { return; }
@@ -209,7 +214,7 @@ namespace keccak {
}
// pad 10*1
last_block[input_len] = 1;
last_block[input_len] = padding_const;
for (int i = 0; i < R - input_len - 1; i++) {
last_block[input_len + i + 1] = 0;
}
@@ -240,11 +245,11 @@ namespace keccak {
switch (rate) {
case KECCAK_256_RATE:
keccak_hash_blocks<KECCAK_256_RATE><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
input, input_len, output_len, number_of_states, output);
input, input_len, output_len, number_of_states, output, PADDING_CONST);
break;
case KECCAK_512_RATE:
keccak_hash_blocks<KECCAK_512_RATE><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
input, input_len, output_len, number_of_states, output);
input, input_len, output_len, number_of_states, output, PADDING_CONST);
break;
default:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "KeccakHash: #rate must be one of [136, 72]");

View File

@@ -129,8 +129,9 @@ namespace merkle_tree {
while (number_of_states > 0) {
CHK_IF_RETURN(compression.run_hash_many_kernel(
(L*)prev_layer, next_layer, number_of_states, tree_config.digest_elements * tree_config.arity,
tree_config.digest_elements, hash_config.ctx));
(L*)prev_layer, next_layer, number_of_states,
tree_config.digest_elements * tree_config.arity * (sizeof(D) / sizeof(L)), tree_config.digest_elements,
hash_config.ctx));
if (!keep_rows || subtree_height < keep_rows) {
D* digests_with_offset =
@@ -298,8 +299,9 @@ namespace merkle_tree {
size_t segment_offset = start_segment_offset;
while (number_of_states > 0) {
CHK_IF_RETURN(compression.run_hash_many_kernel(
(L*)prev_layer, next_layer, number_of_states, tree_config.digest_elements * tree_config.arity,
tree_config.digest_elements, tree_config.ctx));
(L*)prev_layer, next_layer, number_of_states,
tree_config.digest_elements * tree_config.arity * (sizeof(D) / sizeof(L)), tree_config.digest_elements,
tree_config.ctx));
if (!tree_config.keep_rows || cap_height < tree_config.keep_rows + (int)caps_mode) {
D* digests_with_offset = digests + segment_offset;
CHK_IF_RETURN(cudaMemcpyAsync(
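To make the corrected length concrete, a small worked example using the types of the Keccak/SHA-3 tree builders (L = uint8_t leaves, D = uint64_t digests); the digest_elements and arity values below are illustrative assumptions.

```c++
#include <cstdint>
#include <cstdio>

int main()
{
  // Keccak/SHA-3 tree builders hash bytes (L = uint8_t) into 64-bit words (D = uint64_t).
  // Assume a 32-byte digest, i.e. digest_elements = 4 (counted in D units), and arity = 2.
  constexpr unsigned digest_elements = 4, arity = 2;
  constexpr unsigned scale = sizeof(uint64_t) / sizeof(uint8_t); // sizeof(D) / sizeof(L) == 8
  // The compression kernel's input length is measured in L units, so the old value under-counted by 8x.
  printf("old input length: %u\n", digest_elements * arity);         // 8  (uint64_t words, mis-scaled)
  printf("new input length: %u\n", digest_elements * arity * scale); // 64 (bytes: two 32-byte child digests)
}
```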

View File

@@ -95,64 +95,25 @@ namespace vec_ops {
E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
E* d_vec_a;
const E* d_vec_b;
int is_d_alloc_vec_a_allocated = 0;
if (!config.is_a_on_device) {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
is_d_alloc_vec_a_allocated = 1;
d_vec_a = d_alloc_vec_a;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
is_d_alloc_vec_a_allocated = 1;
d_vec_a = d_alloc_vec_a;
}
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_a = d_alloc_vec_a;
} else {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(
&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
is_d_alloc_vec_a_allocated = 1;
d_vec_a = d_alloc_vec_a;
} else {
d_vec_a = vec_a;
}
d_vec_a = vec_a;
}
int is_d_alloc_vec_b_allocated = 0;
if (!config.is_b_on_device) {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
is_d_alloc_vec_b_allocated = 1;
d_vec_b = d_alloc_vec_b;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
is_d_alloc_vec_b_allocated = 1;
d_vec_b = d_alloc_vec_b;
}
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_b = d_alloc_vec_b;
} else {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(
&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
is_d_alloc_vec_b_allocated = 1;
d_vec_b = d_alloc_vec_b;
} else {
d_vec_b = vec_b;
}
d_vec_b = vec_b;
}
int is_d_result_allocated = 0;
if (!config.is_result_on_device) {
if (!is_in_place) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
is_d_result_allocated = 1;
} else {
d_result = d_vec_a;
}
@@ -168,21 +129,12 @@ namespace vec_ops {
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
if (!config.is_result_on_device) {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
} else {
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
}
} else {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
}
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
}
if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); }
if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));

View File

@@ -28,21 +28,6 @@ func (p *MockProjective) FromLimbs(x, y, z []uint32) MockProjective {
return *p
}
func (p *MockProjective) FromAffine(a MockAffine) MockProjective {
z := MockBaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
return *p
}
type MockAffine struct {
X, Y MockBaseField
}
@@ -68,18 +53,3 @@ func (a *MockAffine) FromLimbs(x, y []uint32) MockAffine {
return *a
}
func (a MockAffine) ToProjective() MockProjective {
var z MockBaseField
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p MockProjective
return p.Zero()
}
return MockProjective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
}

View File

@@ -28,8 +28,6 @@ type VecOpsConfig struct {
* non-blocking and you'll need to synchronize it explicitly by calling
* `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
IsAsync bool
/* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */
IsInMontgomeryForm bool
}
/**
@@ -44,7 +42,6 @@ func DefaultVecOpsConfig() VecOpsConfig {
false, // isBOnDevice
false, // isResultOnDevice
false, // IsAsync
false, // IsInMontgomeryForm
}
return config

View File

@@ -15,7 +15,6 @@ func TestVecOpsDefaultConfig(t *testing.T) {
false, // isBOnDevice
false, // isResultOnDevice
false, // IsAsync
false, // IsInMontgomeryForm
}
actual := DefaultVecOpsConfig()

View File

@@ -40,17 +40,10 @@ func (p *Projective) FromLimbs(x, y, z []uint32) Projective {
}
func (p *Projective) FromAffine(a Affine) Projective {
z := BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bls12_377_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *Projective) ProjectiveToAffine() Affine {
var a Affine
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bls12_377_to_affine(cP, cA)
return a
}
@@ -111,18 +104,12 @@ func (a *Affine) FromLimbs(x, y []uint32) Affine {
}
func (a Affine) ToProjective() Projective {
var z BaseField
var p Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p Projective
return p.Zero()
}
return Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
C.bls12_377_from_affine(cA, cP)
return p
}
func AffineFromProjective(p *Projective) Affine {

View File

@@ -40,17 +40,10 @@ func (p *G2Projective) FromLimbs(x, y, z []uint32) G2Projective {
}
func (p *G2Projective) FromAffine(a G2Affine) G2Projective {
z := G2BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bls12_377_g2_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *G2Projective) ProjectiveToAffine() G2Affine {
var a G2Affine
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bls12_377_g2_to_affine(cP, cA)
return a
}
@@ -111,18 +104,12 @@ func (a *G2Affine) FromLimbs(x, y []uint32) G2Affine {
}
func (a G2Affine) ToProjective() G2Projective {
var z G2BaseField
var p G2Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p G2Projective
return p.Zero()
}
return G2Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
C.bls12_377_g2_from_affine(cA, cP)
return p
}
func G2AffineFromProjective(p *G2Projective) G2Affine {

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bls12_377_g2_eq(g2_projective_t* point1, g2_projective_t* point2);
void bls12_377_g2_to_affine(g2_projective_t* point, g2_affine_t* point_out);
void bls12_377_g2_from_affine(g2_affine_t* point, g2_projective_t* point_out);
void bls12_377_g2_generate_projective_points(g2_projective_t* points, int size);
void bls12_377_g2_generate_affine_points(g2_affine_t* points, int size);
cudaError_t bls12_377_g2_affine_convert_montgomery(g2_affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bls12_377_eq(projective_t* point1, projective_t* point2);
void bls12_377_to_affine(projective_t* point, affine_t* point_out);
void bls12_377_from_affine(affine_t* point, projective_t* point_out);
void bls12_377_generate_projective_points(projective_t* points, int size);
void bls12_377_generate_affine_points(affine_t* points, int size);
cudaError_t bls12_377_affine_convert_montgomery(affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -40,17 +40,10 @@ func (p *Projective) FromLimbs(x, y, z []uint32) Projective {
}
func (p *Projective) FromAffine(a Affine) Projective {
z := BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bls12_381_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *Projective) ProjectiveToAffine() Affine {
var a Affine
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bls12_381_to_affine(cP, cA)
return a
}
@@ -111,18 +104,12 @@ func (a *Affine) FromLimbs(x, y []uint32) Affine {
}
func (a Affine) ToProjective() Projective {
var z BaseField
var p Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p Projective
return p.Zero()
}
return Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
C.bls12_381_from_affine(cA, cP)
return p
}
func AffineFromProjective(p *Projective) Affine {

View File

@@ -40,17 +40,10 @@ func (p *G2Projective) FromLimbs(x, y, z []uint32) G2Projective {
}
func (p *G2Projective) FromAffine(a G2Affine) G2Projective {
z := G2BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bls12_381_g2_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *G2Projective) ProjectiveToAffine() G2Affine {
var a G2Affine
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bls12_381_g2_to_affine(cP, cA)
return a
}
@@ -111,18 +104,13 @@ func (a *G2Affine) FromLimbs(x, y []uint32) G2Affine {
}
func (a G2Affine) ToProjective() G2Projective {
var z G2BaseField
var p G2Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p G2Projective
return p.Zero()
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
C.bls12_381_g2_from_affine(cA, cP)
return p
return G2Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
}
func G2AffineFromProjective(p *G2Projective) G2Affine {

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bls12_381_g2_eq(g2_projective_t* point1, g2_projective_t* point2);
void bls12_381_g2_to_affine(g2_projective_t* point, g2_affine_t* point_out);
void bls12_381_g2_from_affine(g2_affine_t* point, g2_projective_t* point_out);
void bls12_381_g2_generate_projective_points(g2_projective_t* points, int size);
void bls12_381_g2_generate_affine_points(g2_affine_t* points, int size);
cudaError_t bls12_381_g2_affine_convert_montgomery(g2_affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bls12_381_eq(projective_t* point1, projective_t* point2);
void bls12_381_to_affine(projective_t* point, affine_t* point_out);
void bls12_381_from_affine(affine_t* point, projective_t* point_out);
void bls12_381_generate_projective_points(projective_t* points, int size);
void bls12_381_generate_affine_points(affine_t* points, int size);
cudaError_t bls12_381_affine_convert_montgomery(affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -40,17 +40,10 @@ func (p *Projective) FromLimbs(x, y, z []uint32) Projective {
}
func (p *Projective) FromAffine(a Affine) Projective {
z := BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bn254_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *Projective) ProjectiveToAffine() Affine {
var a Affine
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bn254_to_affine(cP, cA)
return a
}
@@ -111,18 +104,13 @@ func (a *Affine) FromLimbs(x, y []uint32) Affine {
}
func (a Affine) ToProjective() Projective {
var z BaseField
var p Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p Projective
return p.Zero()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
C.bn254_from_affine(cA, cP)
return p
return Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
}
func AffineFromProjective(p *Projective) Affine {

View File

@@ -40,17 +40,10 @@ func (p *G2Projective) FromLimbs(x, y, z []uint32) G2Projective {
}
func (p *G2Projective) FromAffine(a G2Affine) G2Projective {
z := G2BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bn254_g2_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *G2Projective) ProjectiveToAffine() G2Affine {
var a G2Affine
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bn254_g2_to_affine(cP, cA)
return a
}
@@ -111,18 +104,12 @@ func (a *G2Affine) FromLimbs(x, y []uint32) G2Affine {
}
func (a G2Affine) ToProjective() G2Projective {
var z G2BaseField
var p G2Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p G2Projective
return p.Zero()
}
return G2Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
C.bn254_g2_from_affine(cA, cP)
return p
}
func G2AffineFromProjective(p *G2Projective) G2Affine {

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bn254_g2_eq(g2_projective_t* point1, g2_projective_t* point2);
void bn254_g2_to_affine(g2_projective_t* point, g2_affine_t* point_out);
void bn254_g2_from_affine(g2_affine_t* point, g2_projective_t* point_out);
void bn254_g2_generate_projective_points(g2_projective_t* points, int size);
void bn254_g2_generate_affine_points(g2_affine_t* points, int size);
cudaError_t bn254_g2_affine_convert_montgomery(g2_affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bn254_eq(projective_t* point1, projective_t* point2);
void bn254_to_affine(projective_t* point, affine_t* point_out);
void bn254_from_affine(affine_t* point, projective_t* point_out);
void bn254_generate_projective_points(projective_t* points, int size);
void bn254_generate_affine_points(affine_t* points, int size);
cudaError_t bn254_affine_convert_montgomery(affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -40,17 +40,10 @@ func (p *Projective) FromLimbs(x, y, z []uint32) Projective {
}
func (p *Projective) FromAffine(a Affine) Projective {
z := BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bw6_761_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *Projective) ProjectiveToAffine() Affine {
var a Affine
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.bw6_761_to_affine(cP, cA)
return a
}
@@ -111,18 +104,13 @@ func (a *Affine) FromLimbs(x, y []uint32) Affine {
}
func (a Affine) ToProjective() Projective {
var z BaseField
var p Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p Projective
return p.Zero()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
C.bw6_761_from_affine(cA, cP)
return p
return Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
}
func AffineFromProjective(p *Projective) Affine {

View File

@@ -40,17 +40,10 @@ func (p *G2Projective) FromLimbs(x, y, z []uint32) G2Projective {
}
func (p *G2Projective) FromAffine(a G2Affine) G2Projective {
z := G2BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bw6_761_g2_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *G2Projective) ProjectiveToAffine() G2Affine {
var a G2Affine
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
cP := (*C.g2_projective_t)(unsafe.Pointer(p))
C.bw6_761_g2_to_affine(cP, cA)
return a
}
@@ -111,18 +104,12 @@ func (a *G2Affine) FromLimbs(x, y []uint32) G2Affine {
}
func (a G2Affine) ToProjective() G2Projective {
var z G2BaseField
var p G2Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p G2Projective
return p.Zero()
}
return G2Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
cA := (*C.g2_affine_t)(unsafe.Pointer(&a))
cP := (*C.g2_projective_t)(unsafe.Pointer(&p))
C.bw6_761_g2_from_affine(cA, cP)
return p
}
func G2AffineFromProjective(p *G2Projective) G2Affine {

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bw6_761_g2_eq(g2_projective_t* point1, g2_projective_t* point2);
void bw6_761_g2_to_affine(g2_projective_t* point, g2_affine_t* point_out);
void bw6_761_g2_from_affine(g2_affine_t* point, g2_projective_t* point_out);
void bw6_761_g2_generate_projective_points(g2_projective_t* points, int size);
void bw6_761_g2_generate_affine_points(g2_affine_t* points, int size);
cudaError_t bw6_761_g2_affine_convert_montgomery(g2_affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool bw6_761_eq(projective_t* point1, projective_t* point2);
void bw6_761_to_affine(projective_t* point, affine_t* point_out);
void bw6_761_from_affine(affine_t* point, projective_t* point_out);
void bw6_761_generate_projective_points(projective_t* points, int size);
void bw6_761_generate_affine_points(affine_t* points, int size);
cudaError_t bw6_761_affine_convert_montgomery(affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -40,17 +40,10 @@ func (p *Projective) FromLimbs(x, y, z []uint32) Projective {
}
func (p *Projective) FromAffine(a Affine) Projective {
z := BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
} else {
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.grumpkin_from_affine(cA, cP)
return *p
}
@@ -65,7 +58,7 @@ func (p *Projective) ProjectiveToAffine() Affine {
var a Affine
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
cP := (*C.projective_t)(unsafe.Pointer(p))
C.grumpkin_to_affine(cP, cA)
return a
}
@@ -111,18 +104,13 @@ func (a *Affine) FromLimbs(x, y []uint32) Affine {
}
func (a Affine) ToProjective() Projective {
var z BaseField
var p Projective
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p Projective
return p.Zero()
}
cA := (*C.affine_t)(unsafe.Pointer(&a))
cP := (*C.projective_t)(unsafe.Pointer(&p))
C.grumpkin_from_affine(cA, cP)
return p
return Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
}
func AffineFromProjective(p *Projective) Affine {

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool grumpkin_eq(projective_t* point1, projective_t* point2);
void grumpkin_to_affine(projective_t* point, affine_t* point_out);
void grumpkin_from_affine(affine_t* point, projective_t* point_out);
void grumpkin_generate_projective_points(projective_t* points, int size);
void grumpkin_generate_affine_points(affine_t* points, int size);
cudaError_t grumpkin_affine_convert_montgomery(affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -39,21 +39,17 @@ func (p *{{.CurvePrefix}}Projective) FromLimbs(x, y, z []uint32) {{.CurvePrefix}
return *p
}
{{if ne .CurvePrefix "Mock"}}
func (p *{{.CurvePrefix}}Projective) FromAffine(a {{.CurvePrefix}}Affine) {{.CurvePrefix}}Projective {
z := {{.CurvePrefix}}BaseField{}
z.One()
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
p.Zero()
}else{
p.X = a.X
p.Y = a.Y
p.Z = z.One()
}
cA := (*C.{{toCName .CurvePrefix}}affine_t)(unsafe.Pointer(&a))
cP := (*C.{{toCName .CurvePrefix}}projective_t)(unsafe.Pointer(p))
C.{{.Curve}}{{toCNameBackwards .CurvePrefix}}_from_affine(cA, cP)
return *p
}
{{if ne .CurvePrefix "Mock"}}
func (p {{.CurvePrefix}}Projective) ProjectiveEq(p2 *{{.CurvePrefix}}Projective) bool {
cP := (*C.{{toCName .CurvePrefix}}projective_t)(unsafe.Pointer(&p))
cP2 := (*C.{{toCName .CurvePrefix}}projective_t)(unsafe.Pointer(&p2))
@@ -65,7 +61,7 @@ func (p *{{.CurvePrefix}}Projective) ProjectiveToAffine() {{.CurvePrefix}}Affine
var a {{.CurvePrefix}}Affine
cA := (*C.{{toCName .CurvePrefix}}affine_t)(unsafe.Pointer(&a))
cP := (*C.{{toCName .CurvePrefix}}projective_t)(unsafe.Pointer(&p))
cP := (*C.{{toCName .CurvePrefix}}projective_t)(unsafe.Pointer(p))
C.{{.Curve}}{{toCNameBackwards .CurvePrefix}}_to_affine(cP, cA)
return a
}
@@ -110,21 +106,17 @@ func (a *{{.CurvePrefix}}Affine) FromLimbs(x, y []uint32) {{.CurvePrefix}}Affine
return *a
}
func (a {{.CurvePrefix}}Affine) ToProjective() {{.CurvePrefix}}Projective {
var z {{.CurvePrefix}}BaseField
if (a.X == z.Zero()) && (a.Y == z.Zero()) {
var p {{.CurvePrefix}}Projective
return p.Zero()
}
return {{.CurvePrefix}}Projective{
X: a.X,
Y: a.Y,
Z: z.One(),
}
}
{{if ne .CurvePrefix "Mock"}}
func (a {{.CurvePrefix}}Affine) ToProjective() {{.CurvePrefix}}Projective {
var p {{.CurvePrefix}}Projective
cA := (*C.{{toCName .CurvePrefix}}affine_t)(unsafe.Pointer(&a))
cP := (*C.{{toCName .CurvePrefix}}projective_t)(unsafe.Pointer(&p))
C.{{.Curve}}{{toCNameBackwards .CurvePrefix}}_from_affine(cA, cP)
return p
}
func {{.CurvePrefix}}AffineFromProjective(p *{{.CurvePrefix}}Projective) {{.CurvePrefix}}Affine {
return p.ProjectiveToAffine()
}

View File

@@ -14,6 +14,7 @@ typedef struct DeviceContext DeviceContext;
bool {{.Curve}}{{toCNameBackwards .CurvePrefix}}_eq({{toCName .CurvePrefix}}projective_t* point1, {{toCName .CurvePrefix}}projective_t* point2);
void {{.Curve}}{{toCNameBackwards .CurvePrefix}}_to_affine({{toCName .CurvePrefix}}projective_t* point, {{toCName .CurvePrefix}}affine_t* point_out);
void {{.Curve}}{{toCNameBackwards .CurvePrefix}}_from_affine({{toCName .CurvePrefix}}affine_t* point, {{toCName .CurvePrefix}}projective_t* point_out);
void {{.Curve}}{{toCNameBackwards .CurvePrefix}}_generate_projective_points({{toCName .CurvePrefix}}projective_t* points, int size);
void {{.Curve}}{{toCNameBackwards .CurvePrefix}}_generate_affine_points({{toCName .CurvePrefix}}affine_t* points, int size);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_affine_convert_montgomery({{toCName .CurvePrefix}}affine_t* points, size_t n, bool is_into, DeviceContext* ctx);

View File

@@ -22,6 +22,8 @@ pub trait Curve: Debug + PartialEq + Copy + Clone {
#[doc(hidden)]
fn to_affine(point: *const Projective<Self>, point_aff: *mut Affine<Self>);
#[doc(hidden)]
fn from_affine(point: *const Affine<Self>, point_proj: *mut Projective<Self>);
#[doc(hidden)]
fn generate_random_projective_points(size: usize) -> Vec<Projective<Self>>;
#[doc(hidden)]
fn generate_random_affine_points(size: usize) -> Vec<Affine<Self>>;
@@ -79,27 +81,17 @@ impl<C: Curve> Affine<C> {
}
pub fn to_projective(&self) -> Projective<C> {
if *self == Self::zero() {
return Projective::<C>::zero();
}
Projective {
x: self.x,
y: self.y,
z: C::BaseField::one(),
}
let mut proj = Projective::<C>::zero();
C::from_affine(self as *const Self, &mut proj as *mut Projective<C>);
proj
}
}
impl<C: Curve> From<Affine<C>> for Projective<C> {
fn from(item: Affine<C>) -> Self {
if item == (Affine::<C>::zero()) {
return Self::zero();
}
Self {
x: item.x,
y: item.y,
z: C::BaseField::one(),
}
let mut proj = Self::zero();
C::from_affine(&item as *const Affine<C>, &mut proj as *mut Self);
proj
}
}
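Both `Affine::to_projective` and the `From<Affine<C>>` conversion now defer to the curve's `from_affine` binding instead of assembling the projective point on the Rust side. A minimal round-trip sketch of the new path, assuming the usual `icicle_bn254` type aliases (`G1Affine`, `G1Projective`), which are not part of this diff:

```rust
// Sketch only: the crate and alias names below are assumptions; the generic
// `to_projective` / `From<Affine>` paths are the ones defined in this diff.
use icicle_bn254::curve::{G1Affine, G1Projective};

fn main() {
    let a = G1Affine::zero();

    // Both conversions now go through the C `*_from_affine` symbol.
    let p1: G1Projective = a.to_projective();
    let p2: G1Projective = G1Projective::from(a);

    // Projective equality is assumed to be backed by the curve's C `eq` binding.
    assert!(p1 == p2);
}
```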
@@ -282,6 +274,8 @@ macro_rules! impl_curve {
pub(crate) fn eq(point1: *const $projective_type, point2: *const $projective_type) -> bool;
#[link_name = concat!($curve_prefix, "_to_affine")]
pub(crate) fn proj_to_affine(point: *const $projective_type, point_out: *mut $affine_type);
#[link_name = concat!($curve_prefix, "_from_affine")]
pub(crate) fn proj_from_affine(point: *const $affine_type, point_out: *mut $projective_type);
#[link_name = concat!($curve_prefix, "_generate_projective_points")]
pub(crate) fn generate_projective_points(points: *mut $projective_type, size: usize);
#[link_name = concat!($curve_prefix, "_generate_affine_points")]
@@ -315,6 +309,10 @@ macro_rules! impl_curve {
unsafe { $curve_prefix_ident::proj_to_affine(point, point_out) };
}
fn from_affine(point: *const $affine_type, point_out: *mut $projective_type) {
unsafe { $curve_prefix_ident::proj_from_affine(point, point_out) };
}
fn generate_random_projective_points(size: usize) -> Vec<$projective_type> {
let mut res = vec![$projective_type::zero(); size];
unsafe {

View File

@@ -20,8 +20,6 @@ pub struct VecOpsConfig<'a> {
/// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
/// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
pub is_async: bool,
/// If true then vec_a, vec_b and result are in montgomery form. Default value: false.
pub is_in_montgomery_form: bool,
}
impl<'a> Default for VecOpsConfig<'a> {
@@ -38,7 +36,6 @@ impl<'a> VecOpsConfig<'a> {
is_b_on_device: false,
is_result_on_device: false,
is_async: false,
is_in_montgomery_form: false,
}
}
}
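With `is_in_montgomery_form` removed, the config carries only the device-placement and async flags; any Montgomery conversion presumably has to happen explicitly before the vector ops are called. A minimal sketch of constructing the trimmed-down config, assuming the `icicle_core::vec_ops` module path:

```rust
// Sketch: the module path is an assumption; only fields visible in this diff are used.
use icicle_core::vec_ops::VecOpsConfig;

fn main() {
    // Default config after the change: no Montgomery flag anymore.
    let cfg = VecOpsConfig::default();
    assert!(!cfg.is_async);
}
```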

View File

@@ -25,6 +25,22 @@ extern "C" {
config: &HashConfig,
) -> CudaError;
pub(crate) fn sha3_256_cuda(
input: *const u8,
input_block_size: u32,
number_of_blocks: u32,
output: *mut u8,
config: &HashConfig,
) -> CudaError;
pub(crate) fn sha3_512_cuda(
input: *const u8,
input_block_size: u32,
number_of_blocks: u32,
output: *mut u8,
config: &HashConfig,
) -> CudaError;
pub(crate) fn build_keccak256_merkle_tree_cuda(
leaves: *const u8,
digests: *mut u64,
@@ -40,6 +56,22 @@ extern "C" {
input_block_len: u32,
config: &TreeBuilderConfig,
) -> CudaError;
pub(crate) fn build_sha3_256_merkle_tree_cuda(
leaves: *const u8,
digests: *mut u64,
height: u32,
input_block_len: u32,
config: &TreeBuilderConfig,
) -> CudaError;
pub(crate) fn build_sha3_512_merkle_tree_cuda(
leaves: *const u8,
digests: *mut u64,
height: u32,
input_block_len: u32,
config: &TreeBuilderConfig,
) -> CudaError;
}
pub fn keccak256(
@@ -86,6 +118,50 @@ pub fn keccak512(
}
}
pub fn sha3_256(
input: &(impl HostOrDeviceSlice<u8> + ?Sized),
input_block_size: u32,
number_of_blocks: u32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &HashConfig,
) -> IcicleResult<()> {
let mut local_cfg = config.clone();
local_cfg.are_inputs_on_device = input.is_on_device();
local_cfg.are_outputs_on_device = output.is_on_device();
unsafe {
sha3_256_cuda(
input.as_ptr(),
input_block_size,
number_of_blocks,
output.as_mut_ptr(),
&local_cfg,
)
.wrap()
}
}
pub fn sha3_512(
input: &(impl HostOrDeviceSlice<u8> + ?Sized),
input_block_size: u32,
number_of_blocks: u32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &HashConfig,
) -> IcicleResult<()> {
let mut local_cfg = config.clone();
local_cfg.are_inputs_on_device = input.is_on_device();
local_cfg.are_outputs_on_device = output.is_on_device();
unsafe {
sha3_512_cuda(
input.as_ptr(),
input_block_size,
number_of_blocks,
output.as_mut_ptr(),
&local_cfg,
)
.wrap()
}
}
pub fn build_keccak256_merkle_tree(
leaves: &(impl HostOrDeviceSlice<u8> + ?Sized),
digests: &mut (impl HostOrDeviceSlice<u64> + ?Sized),
@@ -123,3 +199,41 @@ pub fn build_keccak512_merkle_tree(
.wrap()
}
}
pub fn build_sha3_256_merkle_tree(
leaves: &(impl HostOrDeviceSlice<u8> + ?Sized),
digests: &mut (impl HostOrDeviceSlice<u64> + ?Sized),
height: usize,
input_block_len: usize,
config: &TreeBuilderConfig,
) -> IcicleResult<()> {
unsafe {
build_sha3_256_merkle_tree_cuda(
leaves.as_ptr(),
digests.as_mut_ptr(),
height as u32,
input_block_len as u32,
config,
)
.wrap()
}
}
pub fn build_sha3_512_merkle_tree(
leaves: &(impl HostOrDeviceSlice<u8> + ?Sized),
digests: &mut (impl HostOrDeviceSlice<u64> + ?Sized),
height: usize,
input_block_len: usize,
config: &TreeBuilderConfig,
) -> IcicleResult<()> {
unsafe {
build_sha3_512_merkle_tree_cuda(
leaves.as_ptr(),
digests.as_mut_ptr(),
height as u32,
input_block_len as u32,
config,
)
.wrap()
}
}
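The new `sha3_256` and `sha3_512` wrappers follow the existing Keccak entry points argument-for-argument, producing 32- and 64-byte digests respectively. A single-block sketch of calling `sha3_256`; the `use` paths are assumptions about the crate layout rather than something shown in this diff:

```rust
// All `use` paths below are assumptions; the function and its signature are
// the ones added in this diff.
use icicle_core::hash::HashConfig;
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::sha3::sha3_256;

fn main() {
    let input = vec![1u8; 136];     // one 136-byte input block
    let mut digest = vec![0u8; 32]; // sha3-256 digest is 32 bytes

    sha3_256(
        HostSlice::from_slice(&input),
        136, // input_block_size
        1,   // number_of_blocks
        HostSlice::from_mut_slice(&mut digest),
        &HashConfig::default(),
    )
    .unwrap();

    println!("{:02x?}", digest);
}
```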

View File

@@ -15,7 +15,7 @@ pub(crate) mod tests {
let number_of_hashes = 1024;
let preimages = vec![1u8; number_of_hashes * input_block_len];
let mut digests = vec![0u8; number_of_hashes * 64];
let mut digests = vec![0u8; number_of_hashes * 32];
let preimages_slice = HostSlice::from_slice(&preimages);
let digests_slice = HostSlice::from_mut_slice(&mut digests);