mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-09 13:07:59 -05:00
refactor: generate curve-specific function names with macro instead of using objcopy to modify the symbols
This commit is contained in:
@@ -7,37 +7,44 @@
|
||||
// expose Poseidon classes
|
||||
#include "curves/bls12_381/poseidon.cu"
|
||||
|
||||
|
||||
|
||||
// location of a tree node in the array for a given level and offset
|
||||
inline uint32_t tree_index(uint32_t level, uint32_t offset) {
|
||||
return (1 << level) - 1 + offset;
|
||||
}
|
||||
inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }
|
||||
|
||||
// We assume the tree has leaves already set, compute all other levels
|
||||
void build_tree(const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
|
||||
for (uint32_t level = tree_height-1; level>0 ; level-- ) {
|
||||
const uint32_t next_level = level -1;
|
||||
void build_tree(
|
||||
const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t>& poseidon, cudaStream_t stream)
|
||||
{
|
||||
for (uint32_t level = tree_height - 1; level > 0; level--) {
|
||||
const uint32_t next_level = level - 1;
|
||||
const uint32_t next_level_width = 1 << next_level;
|
||||
poseidon.hash_blocks(&tree[tree_index(level,0)], next_level_width, &tree[tree_index(next_level,0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
|
||||
}
|
||||
poseidon.hash_blocks(
|
||||
&tree[tree_index(level, 0)], next_level_width, &tree[tree_index(next_level, 0)],
|
||||
Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
|
||||
}
|
||||
}
|
||||
|
||||
// search leaves for a given hash, return offset
|
||||
uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height) {
|
||||
const uint32_t tree_width = (1 << (tree_height-1));
|
||||
for (uint32_t i=0; i<tree_width; i++) {
|
||||
uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height)
|
||||
{
|
||||
const uint32_t tree_width = (1 << (tree_height - 1));
|
||||
for (uint32_t i = 0; i < tree_width; i++) {
|
||||
const BLS12_381::scalar_t leaf = tree[tree_index(tree_height - 1, i)];
|
||||
if (leaf == query ) {
|
||||
if (leaf == query) {
|
||||
return i; // found the hash
|
||||
}
|
||||
}
|
||||
}
|
||||
return tree_height; // hash not found
|
||||
return tree_height; // hash not found
|
||||
}
|
||||
|
||||
void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, BLS12_381::scalar_t* proof_hash ) {
|
||||
void generate_proof(
|
||||
uint32_t position,
|
||||
BLS12_381::scalar_t* tree,
|
||||
const uint32_t tree_height,
|
||||
uint32_t* proof_lr,
|
||||
BLS12_381::scalar_t* proof_hash)
|
||||
{
|
||||
uint32_t level_index = position;
|
||||
for(uint32_t level = tree_height - 1; level > 0; level--) {
|
||||
for (uint32_t level = tree_height - 1; level > 0; level--) {
|
||||
uint32_t lr;
|
||||
uint32_t neighbour_index;
|
||||
lr = level_index % 2;
|
||||
@@ -49,18 +56,25 @@ void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t
|
||||
neighbour_index = level_index - 1;
|
||||
}
|
||||
proof_lr[level] = lr;
|
||||
proof_hash[level] = tree[tree_index(level,neighbour_index)];
|
||||
proof_hash[level] = tree[tree_index(level, neighbour_index)];
|
||||
level_index /= 2;
|
||||
}
|
||||
// the proof must match this:
|
||||
proof_hash[0] = tree[tree_index(0,0)];
|
||||
proof_hash[0] = tree[tree_index(0, 0)];
|
||||
}
|
||||
|
||||
uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_height, const uint32_t* proof_lr, const BLS12_381::scalar_t* proof_hash, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
|
||||
uint32_t validate_proof(
|
||||
const BLS12_381::scalar_t hash,
|
||||
const uint32_t tree_height,
|
||||
const uint32_t* proof_lr,
|
||||
const BLS12_381::scalar_t* proof_hash,
|
||||
Poseidon<BLS12_381::scalar_t>& poseidon,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
BLS12_381::scalar_t hashes_in[2], hash_out[1], level_hash;
|
||||
level_hash = hash;
|
||||
for(uint32_t level = tree_height - 1; level > 0; level --) {
|
||||
if(proof_lr[level]==0) {
|
||||
for (uint32_t level = tree_height - 1; level > 0; level--) {
|
||||
if (proof_lr[level] == 0) {
|
||||
hashes_in[0] = level_hash;
|
||||
hashes_in[1] = proof_hash[level];
|
||||
} else {
|
||||
@@ -76,12 +90,11 @@ uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_heig
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
|
||||
const uint32_t tree_height = 21;
|
||||
std::cout << "Tree height: " << tree_height << std::endl;
|
||||
const uint32_t tree_arity = 2;
|
||||
const uint32_t leaf_level = tree_height-1;
|
||||
const uint32_t leaf_level = tree_height - 1;
|
||||
const uint32_t tree_width = 1 << leaf_level;
|
||||
std::cout << "Tree width: " << tree_width << std::endl;
|
||||
const uint32_t tree_size = (1 << tree_height) - 1;
|
||||
@@ -93,7 +106,8 @@ int main(int argc, char* argv[])
|
||||
std::cout << "Block size (arity): " << data_arity << std::endl;
|
||||
std::cout << "Initializing blocks..." << std::endl;
|
||||
BLS12_381::scalar_t d = BLS12_381::scalar_t::zero();
|
||||
BLS12_381::scalar_t* data = static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
|
||||
BLS12_381::scalar_t* data =
|
||||
static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
|
||||
for (uint32_t i = 0; i < tree_width * data_arity; i++) {
|
||||
data[i] = d;
|
||||
d = d + BLS12_381::scalar_t::one();
|
||||
@@ -103,31 +117,33 @@ int main(int argc, char* argv[])
|
||||
cudaStream_t stream;
|
||||
cudaStreamCreate(&stream);
|
||||
Poseidon<BLS12_381::scalar_t> data_poseidon(data_arity, stream);
|
||||
data_poseidon.hash_blocks(data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
|
||||
data_poseidon.hash_blocks(
|
||||
data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
|
||||
|
||||
std::cout << "3. Building Merkle tree" << std::endl;
|
||||
Poseidon<BLS12_381::scalar_t> tree_poseidon(tree_arity, stream);
|
||||
build_tree(tree_height, tree, tree_poseidon, stream);
|
||||
|
||||
std::cout << "4. Generate membership proof" << std::endl;
|
||||
uint32_t position = tree_width-1;
|
||||
uint32_t position = tree_width - 1;
|
||||
std::cout << "Using the hash for block: " << position << std::endl;
|
||||
BLS12_381::scalar_t query = tree[tree_index(leaf_level, position)];
|
||||
uint32_t query_position = query_membership(query, tree, tree_height);
|
||||
// allocate arrays for the proof
|
||||
uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
|
||||
BLS12_381::scalar_t* proof_hash = static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
|
||||
generate_proof(query_position, tree, tree_height, proof_lr, proof_hash );
|
||||
BLS12_381::scalar_t* proof_hash =
|
||||
static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
|
||||
generate_proof(query_position, tree, tree_height, proof_lr, proof_hash);
|
||||
|
||||
std::cout << "5. Validate the hash membership" << std::endl;
|
||||
uint32_t validated;
|
||||
const BLS12_381::scalar_t hash = tree[tree_index(leaf_level, query_position)];
|
||||
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
|
||||
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
|
||||
std::cout << "Validated: " << validated << std::endl;
|
||||
|
||||
std::cout << "6. Tamper the hash" << std::endl;
|
||||
const BLS12_381::scalar_t tampered_hash = hash + BLS12_381::scalar_t::one();
|
||||
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
|
||||
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
|
||||
std::cout << "7. Invalidate tamper hash membership" << std::endl;
|
||||
std::cout << "Validated: " << validated << std::endl;
|
||||
return 0;
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
// include MSM template
|
||||
#include "appUtils/msm/msm.cu"
|
||||
// select the curve
|
||||
@@ -42,7 +41,8 @@ int main(int argc, char* argv[])
|
||||
cudaStream_t stream1;
|
||||
cudaStreamCreate(&stream1);
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
large_msm<scalar_t, projective_t, affine_t>(scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
|
||||
large_msm<scalar_t, projective_t, affine_t>(
|
||||
scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
|
||||
printf("On-device runtime: %.3f seconds.\n", elapsed.count() * 1e-9);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <iomanip>
|
||||
#include <chrono>
|
||||
#include <cuda_runtime.h>
|
||||
#include <nvml.h>
|
||||
@@ -15,51 +15,48 @@ const std::string curve = "BN254";
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
|
||||
template <typename T>
|
||||
__global__ void vectorMult(T *vec_a, T *vec_b, T *vec_r, size_t n_elments)
|
||||
__global__ void vectorMult(T* vec_a, T* vec_b, T* vec_r, size_t n_elments)
|
||||
{
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_elments)
|
||||
{
|
||||
vec_r[tid] = vec_a[tid] * vec_b[tid];
|
||||
}
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_elments) { vec_r[tid] = vec_a[tid] * vec_b[tid]; }
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int vector_mult(T *vec_b, T *vec_a, T *vec_result, size_t n_elments)
|
||||
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments)
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
|
||||
int threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
// Set the grid and block dimensions
|
||||
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
|
||||
int threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
|
||||
// Call the kernel to perform element-wise modular multiplication
|
||||
vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
|
||||
return 0;
|
||||
// Call the kernel to perform element-wise modular multiplication
|
||||
vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
const unsigned vector_size = 1 << 20;
|
||||
const unsigned repetitions = 1 << 20;
|
||||
const unsigned repetitions = 1 << 20;
|
||||
|
||||
cudaError_t err;
|
||||
nvmlInit();
|
||||
nvmlDevice_t device;
|
||||
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
|
||||
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
|
||||
std::cout << "Icicle-Examples: vector multiplications" << std::endl;
|
||||
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
|
||||
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
|
||||
std::cout << "GPU Model: " << name << std::endl;
|
||||
} else {
|
||||
std::cerr << "Failed to get GPU model name." << std::endl;
|
||||
}
|
||||
std::cout << "GPU Model: " << name << std::endl;
|
||||
} else {
|
||||
std::cerr << "Failed to get GPU model name." << std::endl;
|
||||
}
|
||||
|
||||
unsigned power_limit;
|
||||
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
|
||||
|
||||
|
||||
std::cout << "Vector size: " << vector_size << std::endl;
|
||||
std::cout << "Repetitions: " << repetitions << std::endl;
|
||||
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
|
||||
|
||||
|
||||
unsigned int baseline_power;
|
||||
nvmlDeviceGetPowerUsage(device, &baseline_power);
|
||||
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
|
||||
@@ -75,8 +72,7 @@ int main(int argc, char** argv)
|
||||
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
|
||||
std::cout << "Initializing vectors with random data" << std::endl;
|
||||
for (int i = 0; i < vector_size; i++) {
|
||||
if ( (i>0) && i % (1<<20) == 0)
|
||||
std::cout << "Elements: " << i << std::endl;
|
||||
if ((i > 0) && i % (1 << 20) == 0) std::cout << "Elements: " << i << std::endl;
|
||||
host_in1[i] = T::rand_host();
|
||||
host_in2[i] = T::rand_host();
|
||||
}
|
||||
@@ -89,40 +85,39 @@ int main(int argc, char** argv)
|
||||
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// copy from host to device
|
||||
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
std::cout << "Starting warm-up" << std::endl;
|
||||
// Warm-up loop
|
||||
for (int i = 0; i < repetitions; i++) {
|
||||
@@ -138,7 +133,8 @@ int main(int argc, char** argv)
|
||||
unsigned power_before;
|
||||
nvmlDeviceGetPowerUsage(device, &power_before);
|
||||
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
|
||||
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_before / power_limit << " %" << std::endl;
|
||||
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
|
||||
<< " %" << std::endl;
|
||||
unsigned temperature_before;
|
||||
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
|
||||
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
|
||||
@@ -156,14 +152,15 @@ int main(int argc, char** argv)
|
||||
// return 0;
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
||||
std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
|
||||
unsigned power_after;
|
||||
nvmlDeviceGetPowerUsage(device, &power_after);
|
||||
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
|
||||
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_after / power_limit << " %" << std::endl;
|
||||
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
|
||||
<< " %" << std::endl;
|
||||
unsigned temperature_after;
|
||||
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
|
||||
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
|
||||
@@ -172,19 +169,19 @@ int main(int argc, char** argv)
|
||||
}
|
||||
|
||||
// Report performance in GMPS: Giga Multiplications Per Second
|
||||
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()) ;
|
||||
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
|
||||
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
|
||||
|
||||
// Optional: validate multiplication
|
||||
T * host_out = (T*)malloc(vector_size * sizeof(T));
|
||||
T* host_out = (T*)malloc(vector_size * sizeof(T));
|
||||
|
||||
cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
|
||||
|
||||
// validate multiplication here...
|
||||
|
||||
free(host_in1);
|
||||
free(host_in1);
|
||||
free(host_in2);
|
||||
free(host_out);
|
||||
free(host_out);
|
||||
cudaFree(device_in1);
|
||||
cudaFree(device_in2);
|
||||
cudaFree(device_out);
|
||||
@@ -192,5 +189,4 @@ int main(int argc, char** argv)
|
||||
nvmlShutdown();
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
@@ -1,7 +1,6 @@
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
// include NTT template
|
||||
#include "icicle/appUtils/ntt/ntt.cuh"
|
||||
|
||||
@@ -13,29 +12,32 @@ using namespace BLS12_381;
|
||||
typedef scalar_t S;
|
||||
typedef scalar_t E;
|
||||
|
||||
scalar_t smult(const unsigned n, scalar_t s) {
|
||||
scalar_t smult(const unsigned n, scalar_t s)
|
||||
{
|
||||
scalar_t r = scalar_t::zero();
|
||||
for (unsigned i=0; i < n; i++) {
|
||||
for (unsigned i = 0; i < n; i++) {
|
||||
r = r + s;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
|
||||
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
|
||||
{
|
||||
// Harmonics 0
|
||||
for (unsigned i = 0; i < ntt_size; i=i+1) {
|
||||
for (unsigned i = 0; i < ntt_size; i = i + 1) {
|
||||
elements[i] = scalar_t::one();
|
||||
}
|
||||
// Harmonics 1
|
||||
for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
|
||||
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
|
||||
elements[i] = scalar_t::one();
|
||||
elements[i+1] = scalar_t::neg(scalar_t::one());
|
||||
elements[i + 1] = scalar_t::neg(scalar_t::one());
|
||||
}
|
||||
}
|
||||
|
||||
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
|
||||
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
|
||||
{
|
||||
int nof_errors = 0;
|
||||
E amplitude = smult(ntt_size,scalar_t::one());
|
||||
E amplitude = smult(ntt_size, scalar_t::one());
|
||||
// std::cout << "Amplitude: " << amplitude << std::endl;
|
||||
// Harmonics 0
|
||||
if (elements[0] != amplitude) {
|
||||
@@ -45,14 +47,14 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elemen
|
||||
std::cout << "Validated harmonics 0" << std::endl;
|
||||
}
|
||||
// Harmonics 1
|
||||
if (elements[ntt_size+1] != amplitude) {
|
||||
if (elements[ntt_size + 1] != amplitude) {
|
||||
++nof_errors;
|
||||
std::cout << "Error in harmonics 1: " << elements[ntt_size+1] << std::endl;
|
||||
std::cout << "Error in harmonics 1: " << elements[ntt_size + 1] << std::endl;
|
||||
} else {
|
||||
std::cout << "Validated harmonics 1" << std::endl;
|
||||
}
|
||||
// for (unsigned i = 0; i < nof_ntts * ntt_size; i++) {
|
||||
// std::cout << elements[i] << std::endl;
|
||||
// std::cout << elements[i] << std::endl;
|
||||
// }
|
||||
return nof_errors;
|
||||
}
|
||||
@@ -68,35 +70,35 @@ int main(int argc, char* argv[])
|
||||
const unsigned nof_ntts = 2;
|
||||
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
|
||||
const unsigned batch_size = nof_ntts * ntt_size;
|
||||
|
||||
|
||||
std::cout << "Generating input data for harmonics 0,1" << std::endl;
|
||||
E* elements;
|
||||
elements = (scalar_t*) malloc(sizeof(E) * batch_size);
|
||||
initialize_input(ntt_size, nof_ntts, elements );
|
||||
|
||||
elements = (scalar_t*)malloc(sizeof(E) * batch_size);
|
||||
initialize_input(ntt_size, nof_ntts, elements);
|
||||
|
||||
std::cout << "Running easy-to-use NTT" << std::endl;
|
||||
cudaStream_t stream;
|
||||
cudaStreamCreate(&stream);
|
||||
|
||||
|
||||
bool inverse = false;
|
||||
auto begin0 = std::chrono::high_resolution_clock::now();
|
||||
ntt_end2end_batch_template<scalar_t, scalar_t>(elements, batch_size, ntt_size, inverse, stream);
|
||||
auto end0 = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
|
||||
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
|
||||
validate_output(ntt_size, nof_ntts, elements );
|
||||
validate_output(ntt_size, nof_ntts, elements);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
std::cout << "Running not that easy-to-use but fast NTT" << std::endl;
|
||||
|
||||
|
||||
uint32_t n_twiddles = ntt_size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order.
|
||||
// represent transform matrix using twiddle factors
|
||||
scalar_t * d_twiddles;
|
||||
scalar_t* d_twiddles;
|
||||
d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega(log_ntt_size), stream); // Sscalar
|
||||
scalar_t* d_elements; // Element
|
||||
|
||||
scalar_t* d_elements; // Element
|
||||
|
||||
cudaMallocAsync(&d_elements, sizeof(scalar_t) * batch_size, stream);
|
||||
initialize_input(ntt_size, nof_ntts, elements );
|
||||
initialize_input(ntt_size, nof_ntts, elements);
|
||||
cudaMemcpyAsync(d_elements, elements, sizeof(scalar_t) * batch_size, cudaMemcpyHostToDevice, stream);
|
||||
S* _null = nullptr;
|
||||
auto begin1 = std::chrono::high_resolution_clock::now();
|
||||
@@ -106,12 +108,12 @@ int main(int argc, char* argv[])
|
||||
auto end1 = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
|
||||
printf("Runtime: %.3e seconds\n", elapsed1.count() * 1e-9);
|
||||
|
||||
|
||||
cudaMemcpyAsync(elements, d_elements, sizeof(E) * batch_size, cudaMemcpyDeviceToHost, stream);
|
||||
validate_output(ntt_size, nof_ntts, elements );
|
||||
validate_output(ntt_size, nof_ntts, elements);
|
||||
cudaFreeAsync(d_elements, stream);
|
||||
cudaFreeAsync(d_twiddles, stream);
|
||||
|
||||
|
||||
cudaStreamDestroy(stream);
|
||||
|
||||
free(elements);
|
||||
|
||||
@@ -64,6 +64,7 @@ project(icicle LANGUAGES CUDA CXX)
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
|
||||
set(CMAKE_CUDA_FLAGS_RELEASE "")
|
||||
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
|
||||
include_directories("${CMAKE_SOURCE_DIR}")
|
||||
|
||||
|
||||
# when adding a new curve/field, append its name to the end of this list
|
||||
@@ -98,35 +99,8 @@ if (NOT BUILD_TESTS)
|
||||
appUtils/msm/msm.cu
|
||||
appUtils/ntt/ntt.cu
|
||||
)
|
||||
#set_target_properties(icicle PROPERTIES CUDA_SEPARABLE_COMPILATION ON) #TODO: enable asap
|
||||
#set_target_properties(icicle PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
target_compile_options(icicle PRIVATE -c)
|
||||
|
||||
add_custom_command(
|
||||
TARGET icicle
|
||||
POST_BUILD
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym MSMCuda=${CURVE}MSMCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym DefaultMSMConfig=${CURVE}DefaultMSMConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym NTTCuda=${CURVE}NTTCuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GetDefaultNTTConfig=${CURVE}DefaultNTTConfig ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym InitializeDomain=${CURVE}InitializeDomain ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateScalars=${CURVE}GenerateScalars ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym Eq=${CURVE}Eq ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ToAffine=${CURVE}ToAffine ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateProjectivePoints=${CURVE}GenerateProjectivePoints ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym GenerateAffinePoints=${CURVE}GenerateAffinePoints ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ScalarConvertMontgomery=${CURVE}ScalarConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym AffineConvertMontgomery=${CURVE}AffineConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
|
||||
COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym ProjectiveConvertMontgomery=${CURVE}ProjectiveConvertMontgomery ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
|
||||
COMMAND ${CMAKE_AR} ARGS -rcs ${PROJECT_BINARY_DIR}/libingo_${CURVE}.a
|
||||
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/vec_ops.cu.o
|
||||
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/field.cu.o
|
||||
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/primitives/projective.cu.o
|
||||
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o
|
||||
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o
|
||||
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/utils/mont.cu.o
|
||||
|
||||
)
|
||||
set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
|
||||
target_compile_definitions(icicle PRIVATE CURVE=${CURVE})
|
||||
|
||||
else()
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
test_msm:
|
||||
mkdir -p work
|
||||
nvcc -o work/test_msm -I. tests/msm_test.cu
|
||||
nvcc -o work/test_msm -I. -I../.. tests/msm_test.cu
|
||||
work/test_msm
|
||||
@@ -10,13 +10,14 @@
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "../../curves/curve_config.cuh"
|
||||
#include "../../primitives/affine.cuh"
|
||||
#include "../../primitives/field.cuh"
|
||||
#include "../../primitives/projective.cuh"
|
||||
#include "../../utils/cuda_utils.cuh"
|
||||
#include "../../utils/error_handler.cuh"
|
||||
#include "../../utils/mont.cuh"
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "primitives/affine.cuh"
|
||||
#include "primitives/field.cuh"
|
||||
#include "primitives/projective.cuh"
|
||||
#include "utils/cuda_utils.cuh"
|
||||
#include "utils/error_handler.cuh"
|
||||
#include "utils/mont.cuh"
|
||||
#include "utils/utils.h"
|
||||
|
||||
namespace msm {
|
||||
|
||||
@@ -966,7 +967,7 @@ namespace msm {
|
||||
|
||||
} // namespace
|
||||
|
||||
extern "C" MSMConfig DefaultMSMConfig()
|
||||
extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)()
|
||||
{
|
||||
device_context::DeviceContext ctx = device_context::get_default_device_context();
|
||||
MSMConfig config = {
|
||||
@@ -1016,7 +1017,7 @@ namespace msm {
|
||||
* - `P` is the [projective representation](@ref projective_t) of curve points.
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t MSMCuda(
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, MSMCuda)(
|
||||
curve_config::scalar_t* scalars,
|
||||
curve_config::affine_t* points,
|
||||
int msm_size,
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "../../curves/curve_config.cuh"
|
||||
#include "../../utils/sharedmem.cuh"
|
||||
#include "../../utils/utils_kernels.cuh"
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "utils/sharedmem.cuh"
|
||||
#include "utils/utils_kernels.cuh"
|
||||
#include "utils/utils.h"
|
||||
|
||||
namespace ntt {
|
||||
|
||||
@@ -508,7 +509,7 @@ namespace ntt {
|
||||
* - `S` is the [scalar field](@ref scalar_t) of the curve;
|
||||
* @return Default [NTTConfig](@ref NTTConfig).
|
||||
*/
|
||||
extern "C" NTTConfig<curve_config::scalar_t> GetDefaultNTTConfig()
|
||||
extern "C" NTTConfig<curve_config::scalar_t> CONCAT_EXPAND(CURVE, GetDefaultNTTConfig)()
|
||||
{
|
||||
return DefaultNTTConfig<curve_config::scalar_t>();
|
||||
}
|
||||
@@ -518,7 +519,8 @@ namespace ntt {
|
||||
* value of template parameter (where the curve is given by `-DCURVE` env variable during build):
|
||||
* - `S` is the [scalar field](@ref scalar_t) of the curve;
|
||||
*/
|
||||
extern "C" cudaError_t InitializeDomain(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
|
||||
extern "C" cudaError_t
|
||||
CONCAT_EXPAND(CURVE, InitializeDomain)(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return InitDomain(primitive_root, ctx);
|
||||
}
|
||||
@@ -529,7 +531,7 @@ namespace ntt {
|
||||
* - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t NTTCuda(
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
|
||||
curve_config::scalar_t* input,
|
||||
int size,
|
||||
NTTDir dir,
|
||||
|
||||
@@ -4,11 +4,12 @@
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "../../curves/curve_config.cuh"
|
||||
#include "../../utils/device_context.cuh"
|
||||
#include "../../utils/error_handler.cuh"
|
||||
#include "../../utils/sharedmem.cuh"
|
||||
#include "../../utils/utils_kernels.cuh"
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "utils/device_context.cuh"
|
||||
#include "utils/error_handler.cuh"
|
||||
#include "utils/sharedmem.cuh"
|
||||
#include "utils/utils_kernels.cuh"
|
||||
#include "utils/utils.h"
|
||||
|
||||
/**
|
||||
* @namespace ntt
|
||||
@@ -86,7 +87,7 @@ namespace ntt {
|
||||
* @return Default value of [NTTConfig](@ref NTTConfig).
|
||||
*/
|
||||
template <typename S>
|
||||
NTTConfig<S> DefaultNTTConfig();
|
||||
NTTConfig<S> CONCAT_EXPAND(CURVE, DefaultNTTConfig)();
|
||||
|
||||
/**
|
||||
* A function that computes NTT or iNTT in-place.
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
#include "../curves/curve_config.cuh"
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "field.cuh"
|
||||
#include "utils/utils.h"
|
||||
|
||||
#define scalar_t curve_config::scalar_t
|
||||
|
||||
extern "C" void GenerateScalars(scalar_t* scalars, int size) { scalar_t::RandHostMany(scalars, size); }
|
||||
extern "C" void CONCAT_EXPAND(CURVE, GenerateScalars)(scalar_t* scalars, int size)
|
||||
{
|
||||
scalar_t::RandHostMany(scalars, size);
|
||||
}
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
#include "../curves/curve_config.cuh"
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "projective.cuh"
|
||||
#include <cuda.h>
|
||||
#include "utils/utils.h"
|
||||
|
||||
#define projective_t curve_config::projective_t // TODO: global to avoid lengthy texts
|
||||
#define affine_t curve_config::affine_t
|
||||
#define point_field_t curve_config::point_field_t
|
||||
|
||||
extern "C" bool Eq(projective_t* point1, projective_t* point2)
|
||||
extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
|
||||
{
|
||||
return (*point1 == *point2) &&
|
||||
!((point1->x == point_field_t::zero()) && (point1->y == point_field_t::zero()) &&
|
||||
@@ -15,11 +16,20 @@ extern "C" bool Eq(projective_t* point1, projective_t* point2)
|
||||
(point2->z == point_field_t::zero()));
|
||||
}
|
||||
|
||||
extern "C" void ToAffine(projective_t* point, affine_t* point_out) { *point_out = projective_t::to_affine(*point); }
|
||||
extern "C" void CONCAT_EXPAND(CURVE, ToAffine)(projective_t* point, affine_t* point_out)
|
||||
{
|
||||
*point_out = projective_t::to_affine(*point);
|
||||
}
|
||||
|
||||
extern "C" void GenerateProjectivePoints(projective_t* points, int size) { projective_t::RandHostMany(points, size); }
|
||||
extern "C" void CONCAT_EXPAND(CURVE, GenerateProjectivePoints)(projective_t* points, int size)
|
||||
{
|
||||
projective_t::RandHostMany(points, size);
|
||||
}
|
||||
|
||||
extern "C" void GenerateAffinePoints(affine_t* points, int size) { projective_t::RandHostManyAffine(points, size); }
|
||||
extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int size)
|
||||
{
|
||||
projective_t::RandHostManyAffine(points, size);
|
||||
}
|
||||
|
||||
#if defined(G2_DEFINED)
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
#include "../curves/curve_config.cuh"
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "device_context.cuh"
|
||||
#include "mont.cuh"
|
||||
#include "utils/utils.h"
|
||||
|
||||
namespace mont {
|
||||
extern "C" cudaError_t
|
||||
ScalarConvertMontgomery(curve_config::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ScalarConvertMontgomery)(
|
||||
curve_config::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
if (is_into) {
|
||||
return ToMontgomery(d_inout, n, ctx.stream, d_inout);
|
||||
@@ -13,8 +14,8 @@ namespace mont {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" cudaError_t
|
||||
AffineConvertMontgomery(curve_config::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, AffineConvertMontgomery)(
|
||||
curve_config::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
if (is_into) {
|
||||
return ToMontgomery(d_inout, n, ctx.stream, d_inout);
|
||||
@@ -23,7 +24,7 @@ namespace mont {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" cudaError_t ProjectiveConvertMontgomery(
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ProjectiveConvertMontgomery)(
|
||||
curve_config::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
if (is_into) {
|
||||
|
||||
8
icicle/utils/utils.h
Normal file
8
icicle/utils/utils.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#pragma once
|
||||
#ifndef ICICLE_UTILS_H
|
||||
#define ICICLE_UTILS_H
|
||||
|
||||
#define CONCAT_DIRECT(a, b) a##b
|
||||
#define CONCAT_EXPAND(a, b) CONCAT_DIRECT(a, b) // expand a,b before concatenation
|
||||
|
||||
#endif // ICICLE_UTILS_H
|
||||
@@ -149,7 +149,7 @@ macro_rules! impl_ntt {
|
||||
output: *mut $field,
|
||||
) -> CudaError;
|
||||
|
||||
#[link_name = concat!($field_prefix, "DefaultNTTConfig")]
|
||||
#[link_name = concat!($field_prefix, "GetDefaultNTTConfig")]
|
||||
fn default_ntt_config() -> NTTConfig<'static, $field>;
|
||||
|
||||
#[link_name = concat!($field_prefix, "InitializeDomain")]
|
||||
|
||||
Reference in New Issue
Block a user