refactor: generate curve-specific function names with macro instead of using objcopy to modify the symbols

This commit is contained in:
Yuval Shekel
2024-01-08 17:19:08 +02:00
parent b184befc99
commit 5fda751e93
14 changed files with 183 additions and 168 deletions

View File

@@ -7,37 +7,44 @@
// expose Poseidon classes
#include "curves/bls12_381/poseidon.cu"
// location of a tree node in the array for a given level and offset
inline uint32_t tree_index(uint32_t level, uint32_t offset) {
return (1 << level) - 1 + offset;
}
inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }
// We assume the tree has leaves already set, compute all other levels
void build_tree(const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
for (uint32_t level = tree_height-1; level>0 ; level-- ) {
const uint32_t next_level = level -1;
void build_tree(
const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t>& poseidon, cudaStream_t stream)
{
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
poseidon.hash_blocks(&tree[tree_index(level,0)], next_level_width, &tree[tree_index(next_level,0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
}
poseidon.hash_blocks(
&tree[tree_index(level, 0)], next_level_width, &tree[tree_index(next_level, 0)],
Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
}
}
// search leaves for a given hash, return offset
uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height) {
const uint32_t tree_width = (1 << (tree_height-1));
for (uint32_t i=0; i<tree_width; i++) {
uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height)
{
const uint32_t tree_width = (1 << (tree_height - 1));
for (uint32_t i = 0; i < tree_width; i++) {
const BLS12_381::scalar_t leaf = tree[tree_index(tree_height - 1, i)];
if (leaf == query ) {
if (leaf == query) {
return i; // found the hash
}
}
}
return tree_height; // hash not found
return tree_height; // hash not found
}
void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, BLS12_381::scalar_t* proof_hash ) {
void generate_proof(
uint32_t position,
BLS12_381::scalar_t* tree,
const uint32_t tree_height,
uint32_t* proof_lr,
BLS12_381::scalar_t* proof_hash)
{
uint32_t level_index = position;
for(uint32_t level = tree_height - 1; level > 0; level--) {
for (uint32_t level = tree_height - 1; level > 0; level--) {
uint32_t lr;
uint32_t neighbour_index;
lr = level_index % 2;
@@ -49,18 +56,25 @@ void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t
neighbour_index = level_index - 1;
}
proof_lr[level] = lr;
proof_hash[level] = tree[tree_index(level,neighbour_index)];
proof_hash[level] = tree[tree_index(level, neighbour_index)];
level_index /= 2;
}
// the proof must match this:
proof_hash[0] = tree[tree_index(0,0)];
proof_hash[0] = tree[tree_index(0, 0)];
}
uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_height, const uint32_t* proof_lr, const BLS12_381::scalar_t* proof_hash, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
uint32_t validate_proof(
const BLS12_381::scalar_t hash,
const uint32_t tree_height,
const uint32_t* proof_lr,
const BLS12_381::scalar_t* proof_hash,
Poseidon<BLS12_381::scalar_t>& poseidon,
cudaStream_t stream)
{
BLS12_381::scalar_t hashes_in[2], hash_out[1], level_hash;
level_hash = hash;
for(uint32_t level = tree_height - 1; level > 0; level --) {
if(proof_lr[level]==0) {
for (uint32_t level = tree_height - 1; level > 0; level--) {
if (proof_lr[level] == 0) {
hashes_in[0] = level_hash;
hashes_in[1] = proof_hash[level];
} else {
@@ -76,12 +90,11 @@ uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_heig
int main(int argc, char* argv[])
{
std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
const uint32_t tree_height = 21;
std::cout << "Tree height: " << tree_height << std::endl;
const uint32_t tree_arity = 2;
const uint32_t leaf_level = tree_height-1;
const uint32_t leaf_level = tree_height - 1;
const uint32_t tree_width = 1 << leaf_level;
std::cout << "Tree width: " << tree_width << std::endl;
const uint32_t tree_size = (1 << tree_height) - 1;
@@ -93,7 +106,8 @@ int main(int argc, char* argv[])
std::cout << "Block size (arity): " << data_arity << std::endl;
std::cout << "Initializing blocks..." << std::endl;
BLS12_381::scalar_t d = BLS12_381::scalar_t::zero();
BLS12_381::scalar_t* data = static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
BLS12_381::scalar_t* data =
static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
for (uint32_t i = 0; i < tree_width * data_arity; i++) {
data[i] = d;
d = d + BLS12_381::scalar_t::one();
@@ -103,31 +117,33 @@ int main(int argc, char* argv[])
cudaStream_t stream;
cudaStreamCreate(&stream);
Poseidon<BLS12_381::scalar_t> data_poseidon(data_arity, stream);
data_poseidon.hash_blocks(data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
data_poseidon.hash_blocks(
data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
std::cout << "3. Building Merkle tree" << std::endl;
Poseidon<BLS12_381::scalar_t> tree_poseidon(tree_arity, stream);
build_tree(tree_height, tree, tree_poseidon, stream);
std::cout << "4. Generate membership proof" << std::endl;
uint32_t position = tree_width-1;
uint32_t position = tree_width - 1;
std::cout << "Using the hash for block: " << position << std::endl;
BLS12_381::scalar_t query = tree[tree_index(leaf_level, position)];
uint32_t query_position = query_membership(query, tree, tree_height);
// allocate arrays for the proof
uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
BLS12_381::scalar_t* proof_hash = static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
generate_proof(query_position, tree, tree_height, proof_lr, proof_hash );
BLS12_381::scalar_t* proof_hash =
static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
generate_proof(query_position, tree, tree_height, proof_lr, proof_hash);
std::cout << "5. Validate the hash membership" << std::endl;
uint32_t validated;
const BLS12_381::scalar_t hash = tree[tree_index(leaf_level, query_position)];
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
std::cout << "Validated: " << validated << std::endl;
std::cout << "6. Tamper the hash" << std::endl;
const BLS12_381::scalar_t tampered_hash = hash + BLS12_381::scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
std::cout << "7. Invalidate tamper hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;
return 0;

View File

@@ -2,7 +2,6 @@
#include <fstream>
#include <iostream>
// include MSM template
#include "appUtils/msm/msm.cu"
// select the curve
@@ -42,7 +41,8 @@ int main(int argc, char* argv[])
cudaStream_t stream1;
cudaStreamCreate(&stream1);
auto begin = std::chrono::high_resolution_clock::now();
large_msm<scalar_t, projective_t, affine_t>(scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
large_msm<scalar_t, projective_t, affine_t>(
scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("On-device runtime: %.3f seconds.\n", elapsed.count() * 1e-9);

View File

@@ -1,5 +1,5 @@
#include <iostream>
#include <iomanip>
#include <iomanip>
#include <chrono>
#include <cuda_runtime.h>
#include <nvml.h>
@@ -15,51 +15,48 @@ const std::string curve = "BN254";
#define MAX_THREADS_PER_BLOCK 256
template <typename T>
__global__ void vectorMult(T *vec_a, T *vec_b, T *vec_r, size_t n_elments)
__global__ void vectorMult(T* vec_a, T* vec_b, T* vec_r, size_t n_elments)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n_elments)
{
vec_r[tid] = vec_a[tid] * vec_b[tid];
}
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n_elments) { vec_r[tid] = vec_a[tid] * vec_b[tid]; }
}
template <typename T>
int vector_mult(T *vec_b, T *vec_a, T *vec_result, size_t n_elments)
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments)
{
// Set the grid and block dimensions
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
int threads_per_block = MAX_THREADS_PER_BLOCK;
// Set the grid and block dimensions
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
int threads_per_block = MAX_THREADS_PER_BLOCK;
// Call the kernel to perform element-wise modular multiplication
vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
return 0;
// Call the kernel to perform element-wise modular multiplication
vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 20;
const unsigned repetitions = 1 << 20;
const unsigned repetitions = 1 << 20;
cudaError_t err;
nvmlInit();
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
std::cout << "Icicle-Examples: vector multiplications" << std::endl;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
unsigned power_limit;
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
std::cout << "Vector size: " << vector_size << std::endl;
std::cout << "Repetitions: " << repetitions << std::endl;
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
unsigned int baseline_power;
nvmlDeviceGetPowerUsage(device, &baseline_power);
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
@@ -75,8 +72,7 @@ int main(int argc, char** argv)
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with random data" << std::endl;
for (int i = 0; i < vector_size; i++) {
if ( (i>0) && i % (1<<20) == 0)
std::cout << "Elements: " << i << std::endl;
if ((i > 0) && i % (1 << 20) == 0) std::cout << "Elements: " << i << std::endl;
host_in1[i] = T::rand_host();
host_in2[i] = T::rand_host();
}
@@ -89,40 +85,39 @@ int main(int argc, char** argv)
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
// copy from host to device
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
@@ -138,7 +133,8 @@ int main(int argc, char** argv)
unsigned power_before;
nvmlDeviceGetPowerUsage(device, &power_before);
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_before / power_limit << " %" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
<< " %" << std::endl;
unsigned temperature_before;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
@@ -156,14 +152,15 @@ int main(int argc, char** argv)
// return 0;
// }
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_after / power_limit << " %" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
<< " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
@@ -172,19 +169,19 @@ int main(int argc, char** argv)
}
// Report performance in GMPS: Giga Multiplications Per Second
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()) ;
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
// Optional: validate multiplication
T * host_out = (T*)malloc(vector_size * sizeof(T));
T* host_out = (T*)malloc(vector_size * sizeof(T));
cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
// validate multiplication here...
free(host_in1);
free(host_in1);
free(host_in2);
free(host_out);
free(host_out);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
@@ -192,5 +189,4 @@ int main(int argc, char** argv)
nvmlShutdown();
return 0;
}

View File

@@ -1,7 +1,6 @@
#include <chrono>
#include <iostream>
// include NTT template
#include "icicle/appUtils/ntt/ntt.cuh"
@@ -13,29 +12,32 @@ using namespace BLS12_381;
typedef scalar_t S;
typedef scalar_t E;
scalar_t smult(const unsigned n, scalar_t s) {
scalar_t smult(const unsigned n, scalar_t s)
{
scalar_t r = scalar_t::zero();
for (unsigned i=0; i < n; i++) {
for (unsigned i = 0; i < n; i++) {
r = r + s;
}
return r;
}
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
// Harmonics 0
for (unsigned i = 0; i < ntt_size; i=i+1) {
for (unsigned i = 0; i < ntt_size; i = i + 1) {
elements[i] = scalar_t::one();
}
// Harmonics 1
for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
elements[i] = scalar_t::one();
elements[i+1] = scalar_t::neg(scalar_t::one());
elements[i + 1] = scalar_t::neg(scalar_t::one());
}
}
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
int nof_errors = 0;
E amplitude = smult(ntt_size,scalar_t::one());
E amplitude = smult(ntt_size, scalar_t::one());
// std::cout << "Amplitude: " << amplitude << std::endl;
// Harmonics 0
if (elements[0] != amplitude) {
@@ -45,14 +47,14 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elemen
std::cout << "Validated harmonics 0" << std::endl;
}
// Harmonics 1
if (elements[ntt_size+1] != amplitude) {
if (elements[ntt_size + 1] != amplitude) {
++nof_errors;
std::cout << "Error in harmonics 1: " << elements[ntt_size+1] << std::endl;
std::cout << "Error in harmonics 1: " << elements[ntt_size + 1] << std::endl;
} else {
std::cout << "Validated harmonics 1" << std::endl;
}
// for (unsigned i = 0; i < nof_ntts * ntt_size; i++) {
// std::cout << elements[i] << std::endl;
// std::cout << elements[i] << std::endl;
// }
return nof_errors;
}
@@ -68,35 +70,35 @@ int main(int argc, char* argv[])
const unsigned nof_ntts = 2;
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
const unsigned batch_size = nof_ntts * ntt_size;
std::cout << "Generating input data for harmonics 0,1" << std::endl;
E* elements;
elements = (scalar_t*) malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, elements );
elements = (scalar_t*)malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, elements);
std::cout << "Running easy-to-use NTT" << std::endl;
cudaStream_t stream;
cudaStreamCreate(&stream);
bool inverse = false;
auto begin0 = std::chrono::high_resolution_clock::now();
ntt_end2end_batch_template<scalar_t, scalar_t>(elements, batch_size, ntt_size, inverse, stream);
auto end0 = std::chrono::high_resolution_clock::now();
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
validate_output(ntt_size, nof_ntts, elements );
validate_output(ntt_size, nof_ntts, elements);
cudaStreamSynchronize(stream);
std::cout << "Running not that easy-to-use but fast NTT" << std::endl;
uint32_t n_twiddles = ntt_size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order.
// represent transform matrix using twiddle factors
scalar_t * d_twiddles;
scalar_t* d_twiddles;
d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega(log_ntt_size), stream); // Sscalar
scalar_t* d_elements; // Element
scalar_t* d_elements; // Element
cudaMallocAsync(&d_elements, sizeof(scalar_t) * batch_size, stream);
initialize_input(ntt_size, nof_ntts, elements );
initialize_input(ntt_size, nof_ntts, elements);
cudaMemcpyAsync(d_elements, elements, sizeof(scalar_t) * batch_size, cudaMemcpyHostToDevice, stream);
S* _null = nullptr;
auto begin1 = std::chrono::high_resolution_clock::now();
@@ -106,12 +108,12 @@ int main(int argc, char* argv[])
auto end1 = std::chrono::high_resolution_clock::now();
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("Runtime: %.3e seconds\n", elapsed1.count() * 1e-9);
cudaMemcpyAsync(elements, d_elements, sizeof(E) * batch_size, cudaMemcpyDeviceToHost, stream);
validate_output(ntt_size, nof_ntts, elements );
validate_output(ntt_size, nof_ntts, elements);
cudaFreeAsync(d_elements, stream);
cudaFreeAsync(d_twiddles, stream);
cudaStreamDestroy(stream);
free(elements);

View File

@@ -64,6 +64,7 @@ project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
include_directories("${CMAKE_SOURCE_DIR}")
# when adding a new curve/field, append its name to the end of this list
@@ -98,35 +99,8 @@ if (NOT BUILD_TESTS)
appUtils/msm/msm.cu
appUtils/ntt/ntt.cu
)
#set_target_properties(icicle PROPERTIES CUDA_SEPARABLE_COMPILATION ON) #TODO: enable asap
#set_target_properties(icicle PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_compile_options(icicle PRIVATE -c)
add_custom_command(
TARGET icicle
POST_BUILD
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym MSMCuda=${CURVE}MSMCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym DefaultMSMConfig=${CURVE}DefaultMSMConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym NTTCuda=${CURVE}NTTCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GetDefaultNTTConfig=${CURVE}DefaultNTTConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym InitializeDomain=${CURVE}InitializeDomain ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateScalars=${CURVE}GenerateScalars ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym Eq=${CURVE}Eq ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ToAffine=${CURVE}ToAffine ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateProjectivePoints=${CURVE}GenerateProjectivePoints ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateAffinePoints=${CURVE}GenerateAffinePoints ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ScalarConvertMontgomery=${CURVE}ScalarConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym AffineConvertMontgomery=${CURVE}AffineConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ProjectiveConvertMontgomery=${CURVE}ProjectiveConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
COMMAND ${CMAKE_AR} ARGS -rcs ${PROJECT_BINARY_DIR}/libingo_${CURVE}.a
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/vec_ops.cu.o
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
)
set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
target_compile_definitions(icicle PRIVATE CURVE=${CURVE})
else()

View File

@@ -1,4 +1,4 @@
test_msm:
mkdir -p work
nvcc -o work/test_msm -I. tests/msm_test.cu
nvcc -o work/test_msm -I. -I../.. tests/msm_test.cu
work/test_msm

View File

@@ -10,13 +10,14 @@
#include <stdexcept>
#include <vector>
#include "../../curves/curve_config.cuh"
#include "../../primitives/affine.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/error_handler.cuh"
#include "../../utils/mont.cuh"
#include "curves/curve_config.cuh"
#include "primitives/affine.cuh"
#include "primitives/field.cuh"
#include "primitives/projective.cuh"
#include "utils/cuda_utils.cuh"
#include "utils/error_handler.cuh"
#include "utils/mont.cuh"
#include "utils/utils.h"
namespace msm {
@@ -966,7 +967,7 @@ namespace msm {
} // namespace
extern "C" MSMConfig DefaultMSMConfig()
extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)()
{
device_context::DeviceContext ctx = device_context::get_default_device_context();
MSMConfig config = {
@@ -1016,7 +1017,7 @@ namespace msm {
* - `P` is the [projective representation](@ref projective_t) of curve points.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t MSMCuda(
extern "C" cudaError_t CONCAT_EXPAND(CURVE, MSMCuda)(
curve_config::scalar_t* scalars,
curve_config::affine_t* points,
int msm_size,

View File

@@ -3,9 +3,10 @@
#include <unordered_map>
#include <vector>
#include "../../curves/curve_config.cuh"
#include "../../utils/sharedmem.cuh"
#include "../../utils/utils_kernels.cuh"
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
namespace ntt {
@@ -508,7 +509,7 @@ namespace ntt {
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* @return Default [NTTConfig](@ref NTTConfig).
*/
extern "C" NTTConfig<curve_config::scalar_t> GetDefaultNTTConfig()
extern "C" NTTConfig<curve_config::scalar_t> CONCAT_EXPAND(CURVE, GetDefaultNTTConfig)()
{
return DefaultNTTConfig<curve_config::scalar_t>();
}
@@ -518,7 +519,8 @@ namespace ntt {
* value of template parameter (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
*/
extern "C" cudaError_t InitializeDomain(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
extern "C" cudaError_t
CONCAT_EXPAND(CURVE, InitializeDomain)(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
{
return InitDomain(primitive_root, ctx);
}
@@ -529,7 +531,7 @@ namespace ntt {
* - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t NTTCuda(
extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
curve_config::scalar_t* input,
int size,
NTTDir dir,

View File

@@ -4,11 +4,12 @@
#include <cuda_runtime.h>
#include "../../curves/curve_config.cuh"
#include "../../utils/device_context.cuh"
#include "../../utils/error_handler.cuh"
#include "../../utils/sharedmem.cuh"
#include "../../utils/utils_kernels.cuh"
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"
#include "utils/error_handler.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
/**
* @namespace ntt
@@ -86,7 +87,7 @@ namespace ntt {
* @return Default value of [NTTConfig](@ref NTTConfig).
*/
template <typename S>
NTTConfig<S> DefaultNTTConfig();
NTTConfig<S> CONCAT_EXPAND(CURVE, DefaultNTTConfig)();
/**
* A function that computes NTT or iNTT in-place.

View File

@@ -1,6 +1,10 @@
#include "../curves/curve_config.cuh"
#include "curves/curve_config.cuh"
#include "field.cuh"
#include "utils/utils.h"
#define scalar_t curve_config::scalar_t
extern "C" void GenerateScalars(scalar_t* scalars, int size) { scalar_t::RandHostMany(scalars, size); }
extern "C" void CONCAT_EXPAND(CURVE, GenerateScalars)(scalar_t* scalars, int size)
{
scalar_t::RandHostMany(scalars, size);
}

View File

@@ -1,12 +1,13 @@
#include "../curves/curve_config.cuh"
#include "curves/curve_config.cuh"
#include "projective.cuh"
#include <cuda.h>
#include "utils/utils.h"
#define projective_t curve_config::projective_t // TODO: global to avoid lengthy texts
#define affine_t curve_config::affine_t
#define point_field_t curve_config::point_field_t
extern "C" bool Eq(projective_t* point1, projective_t* point2)
extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
{
return (*point1 == *point2) &&
!((point1->x == point_field_t::zero()) && (point1->y == point_field_t::zero()) &&
@@ -15,11 +16,20 @@ extern "C" bool Eq(projective_t* point1, projective_t* point2)
(point2->z == point_field_t::zero()));
}
extern "C" void ToAffine(projective_t* point, affine_t* point_out) { *point_out = projective_t::to_affine(*point); }
extern "C" void CONCAT_EXPAND(CURVE, ToAffine)(projective_t* point, affine_t* point_out)
{
*point_out = projective_t::to_affine(*point);
}
extern "C" void GenerateProjectivePoints(projective_t* points, int size) { projective_t::RandHostMany(points, size); }
extern "C" void CONCAT_EXPAND(CURVE, GenerateProjectivePoints)(projective_t* points, int size)
{
projective_t::RandHostMany(points, size);
}
extern "C" void GenerateAffinePoints(affine_t* points, int size) { projective_t::RandHostManyAffine(points, size); }
extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int size)
{
projective_t::RandHostManyAffine(points, size);
}
#if defined(G2_DEFINED)

View File

@@ -1,10 +1,11 @@
#include "../curves/curve_config.cuh"
#include "curves/curve_config.cuh"
#include "device_context.cuh"
#include "mont.cuh"
#include "utils/utils.h"
namespace mont {
extern "C" cudaError_t
ScalarConvertMontgomery(curve_config::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ScalarConvertMontgomery)(
curve_config::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
if (is_into) {
return ToMontgomery(d_inout, n, ctx.stream, d_inout);
@@ -13,8 +14,8 @@ namespace mont {
}
}
extern "C" cudaError_t
AffineConvertMontgomery(curve_config::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
extern "C" cudaError_t CONCAT_EXPAND(CURVE, AffineConvertMontgomery)(
curve_config::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
if (is_into) {
return ToMontgomery(d_inout, n, ctx.stream, d_inout);
@@ -23,7 +24,7 @@ namespace mont {
}
}
extern "C" cudaError_t ProjectiveConvertMontgomery(
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ProjectiveConvertMontgomery)(
curve_config::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
if (is_into) {

8
icicle/utils/utils.h Normal file
View File

@@ -0,0 +1,8 @@
#pragma once
#ifndef ICICLE_UTILS_H
#define ICICLE_UTILS_H
#define CONCAT_DIRECT(a, b) a##b
#define CONCAT_EXPAND(a, b) CONCAT_DIRECT(a, b) // expand a,b before concatenation
#endif // ICICLE_UTILS_H

View File

@@ -149,7 +149,7 @@ macro_rules! impl_ntt {
output: *mut $field,
) -> CudaError;
#[link_name = concat!($field_prefix, "DefaultNTTConfig")]
#[link_name = concat!($field_prefix, "GetDefaultNTTConfig")]
fn default_ntt_config() -> NTTConfig<'static, $field>;
#[link_name = concat!($field_prefix, "InitializeDomain")]