mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-08 23:17:54 -05:00
Mixed-radix NTT algorithm
Co-authored-by: hadaringonyama <hadar@ingonyama.com>
This commit is contained in:
@@ -5,28 +5,31 @@
|
||||
#define CURVE_ID 1
|
||||
// include NTT template
|
||||
#include "appUtils/ntt/ntt.cu"
|
||||
#include "appUtils/ntt/kernel_ntt.cu"
|
||||
using namespace curve_config;
|
||||
|
||||
// Operate on scalars
|
||||
typedef scalar_t S;
|
||||
typedef scalar_t E;
|
||||
|
||||
void print_elements(const unsigned n, E * elements ) {
|
||||
void print_elements(const unsigned n, E* elements)
|
||||
{
|
||||
for (unsigned i = 0; i < n; i++) {
|
||||
std::cout << i << ": " << elements[i] << std::endl;
|
||||
std::cout << i << ": " << elements[i] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
|
||||
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
|
||||
{
|
||||
// Lowest Harmonics
|
||||
for (unsigned i = 0; i < ntt_size; i=i+1) {
|
||||
for (unsigned i = 0; i < ntt_size; i = i + 1) {
|
||||
elements[i] = E::one();
|
||||
}
|
||||
// print_elements(ntt_size, elements );
|
||||
// Highest Harmonics
|
||||
for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
|
||||
elements[i] = E::one();
|
||||
elements[i+1] = E::neg(scalar_t::one());
|
||||
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
|
||||
elements[i] = E::one();
|
||||
elements[i + 1] = E::neg(scalar_t::one());
|
||||
}
|
||||
// print_elements(ntt_size, &elements[1*ntt_size] );
|
||||
}
|
||||
@@ -34,7 +37,7 @@ void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elem
|
||||
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
|
||||
{
|
||||
int nof_errors = 0;
|
||||
E amplitude = E::from((uint32_t) ntt_size);
|
||||
E amplitude = E::from((uint32_t)ntt_size);
|
||||
// std::cout << "Amplitude: " << amplitude << std::endl;
|
||||
// Lowest Harmonics
|
||||
if (elements[0] != amplitude) {
|
||||
@@ -44,8 +47,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
|
||||
} else {
|
||||
std::cout << "Validated lowest harmonics" << std::endl;
|
||||
}
|
||||
// Highest Harmonics
|
||||
if (elements[1*ntt_size+ntt_size/2] != amplitude) {
|
||||
// Highest Harmonics
|
||||
if (elements[1 * ntt_size + ntt_size / 2] != amplitude) {
|
||||
++nof_errors;
|
||||
std::cout << "Error in highest harmonics! " << std::endl;
|
||||
// print_elements(ntt_size, &elements[1*ntt_size] );
|
||||
@@ -66,24 +69,24 @@ int main(int argc, char* argv[])
|
||||
const unsigned nof_ntts = 2;
|
||||
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
|
||||
const unsigned batch_size = nof_ntts * ntt_size;
|
||||
|
||||
|
||||
std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
|
||||
E* input;
|
||||
input = (E*) malloc(sizeof(E) * batch_size);
|
||||
initialize_input(ntt_size, nof_ntts, input );
|
||||
input = (E*)malloc(sizeof(E) * batch_size);
|
||||
initialize_input(ntt_size, nof_ntts, input);
|
||||
E* output;
|
||||
output = (E*) malloc(sizeof(E) * batch_size);
|
||||
|
||||
output = (E*)malloc(sizeof(E) * batch_size);
|
||||
|
||||
std::cout << "Running NTT with on-host data" << std::endl;
|
||||
cudaStream_t stream;
|
||||
cudaStreamCreate(&stream);
|
||||
// Create a device context
|
||||
auto ctx = device_context::get_default_device_context();
|
||||
// the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
|
||||
S rou = S{ {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1} };
|
||||
S rou = S{{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1}};
|
||||
ntt::InitDomain(rou, ctx);
|
||||
// Create an NTTConfig instance
|
||||
ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
|
||||
ntt::NTTConfig<S> config = ntt::DefaultNTTConfig<S>();
|
||||
config.batch_size = nof_ntts;
|
||||
config.ctx.stream = stream;
|
||||
auto begin0 = std::chrono::high_resolution_clock::now();
|
||||
@@ -91,7 +94,7 @@ int main(int argc, char* argv[])
|
||||
auto end0 = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
|
||||
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
|
||||
validate_output(ntt_size, nof_ntts, output );
|
||||
validate_output(ntt_size, nof_ntts, output);
|
||||
cudaStreamDestroy(stream);
|
||||
free(input);
|
||||
free(output);
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
# Make sure NVIDIA Container Toolkit is installed on your host
|
||||
|
||||
# Use the specified base image
|
||||
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
|
||||
|
||||
# Update and install dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
cmake \
|
||||
curl \
|
||||
build-essential \
|
||||
git \
|
||||
libboost-all-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Clone Icicle from a GitHub repository
|
||||
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /icicle-example
|
||||
|
||||
# Specify the default command for the container
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "Icicle Examples: polynomial multiplication",
|
||||
"build": {
|
||||
"dockerfile": "Dockerfile"
|
||||
},
|
||||
"runArgs": [
|
||||
"--gpus",
|
||||
"all"
|
||||
],
|
||||
"postCreateCommand": [
|
||||
"nvidia-smi"
|
||||
],
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"extensions": [
|
||||
"ms-vscode.cmake-tools",
|
||||
"ms-python.python",
|
||||
"ms-vscode.cpptools"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
26
examples/c++/polynomial_multiplication/CMakeLists.txt
Normal file
26
examples/c++/polynomial_multiplication/CMakeLists.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CUDA_STANDARD 17)
|
||||
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
|
||||
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
|
||||
endif ()
|
||||
project(icicle LANGUAGES CUDA CXX)
|
||||
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
|
||||
set(CMAKE_CUDA_FLAGS_RELEASE "")
|
||||
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
|
||||
# change the path to your Icicle location
|
||||
include_directories("../../../icicle")
|
||||
add_executable(
|
||||
example
|
||||
example.cu
|
||||
)
|
||||
|
||||
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
|
||||
target_link_libraries(example ${NVML_LIBRARY})
|
||||
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
11
examples/c++/polynomial_multiplication/compile.sh
Executable file
11
examples/c++/polynomial_multiplication/compile.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Exit immediately on error
|
||||
set -e
|
||||
|
||||
rm -rf build
|
||||
mkdir -p build
|
||||
cmake -S . -B build
|
||||
cmake --build build
|
||||
|
||||
|
||||
114
examples/c++/polynomial_multiplication/example.cu
Normal file
114
examples/c++/polynomial_multiplication/example.cu
Normal file
@@ -0,0 +1,114 @@
|
||||
#define CURVE_ID BLS12_381
|
||||
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "curves/curve_config.cuh"
|
||||
#include "appUtils/ntt/ntt.cu"
|
||||
#include "appUtils/ntt/kernel_ntt.cu"
|
||||
#include "utils/vec_ops.cu"
|
||||
#include "utils/error_handler.cuh"
|
||||
#include <memory>
|
||||
|
||||
typedef curve_config::scalar_t test_scalar;
|
||||
typedef curve_config::scalar_t test_data;
|
||||
|
||||
void random_samples(test_data* res, uint32_t count)
|
||||
{
|
||||
for (int i = 0; i < count; i++)
|
||||
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
|
||||
}
|
||||
|
||||
void incremental_values(test_scalar* res, uint32_t count)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
res[i] = i ? res[i - 1] + test_scalar::one() * test_scalar::omega(4) : test_scalar::zero();
|
||||
}
|
||||
}
|
||||
|
||||
// calcaulting polynomial multiplication A*B via NTT,pointwise-multiplication and INTT
|
||||
// (1) allocate A,B on CPU. Randomize first half, zero second half
|
||||
// (2) allocate NttAGpu, NttBGpu on GPU
|
||||
// (3) calc NTT for A and for B from cpu to GPU
|
||||
// (4) multiply MulGpu = NttAGpu * NttBGpu (pointwise)
|
||||
// (5) INTT MulGpu inplace
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
float measured_time;
|
||||
|
||||
int NTT_LOG_SIZE = 23;
|
||||
int NTT_SIZE = 1 << NTT_LOG_SIZE;
|
||||
|
||||
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context
|
||||
|
||||
// init domain
|
||||
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
|
||||
ntt_config.ordering = ntt::Ordering::kNN; // TODO: use NR for forward and RN for backward
|
||||
ntt_config.is_force_radix2 = (argc > 1) ? atoi(argv[1]) : false;
|
||||
|
||||
const char* ntt_alg_str = ntt_config.is_force_radix2 ? "Radix-2" : "Mixed-Radix";
|
||||
std::cout << "Polynomial multiplication with " << ntt_alg_str << " NTT: ";
|
||||
|
||||
CHK_IF_RETURN(cudaEventCreate(&start));
|
||||
CHK_IF_RETURN(cudaEventCreate(&stop));
|
||||
|
||||
const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
|
||||
ntt::InitDomain(basic_root, ntt_config.ctx);
|
||||
|
||||
// (1) cpu allocation
|
||||
auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
|
||||
auto CpuB = std::make_unique<test_data[]>(NTT_SIZE);
|
||||
random_samples(CpuA.get(), NTT_SIZE >> 1); // second half zeros
|
||||
random_samples(CpuB.get(), NTT_SIZE >> 1); // second half zeros
|
||||
|
||||
test_data *GpuA, *GpuB, *MulGpu;
|
||||
|
||||
auto benchmark = [&](bool print, int iterations = 1) {
|
||||
// start recording
|
||||
CHK_IF_RETURN(cudaEventRecord(start, ntt_config.ctx.stream));
|
||||
|
||||
for (int iter = 0; iter < iterations; ++iter) {
|
||||
// (2) gpu input allocation
|
||||
CHK_IF_RETURN(cudaMallocAsync(&GpuA, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaMallocAsync(&GpuB, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
|
||||
|
||||
// (3) NTT for A,B from cpu to gpu
|
||||
ntt_config.are_inputs_on_device = false;
|
||||
ntt_config.are_outputs_on_device = true;
|
||||
CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
|
||||
CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
|
||||
|
||||
// (4) multiply A,B
|
||||
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
|
||||
CHK_IF_RETURN(
|
||||
vec_ops::Mul(GpuA, GpuB, NTT_SIZE, true /*=is_on_device*/, false /*=is_montgomery*/, ntt_config.ctx, MulGpu));
|
||||
|
||||
// (5) INTT (in place)
|
||||
ntt_config.are_inputs_on_device = true;
|
||||
ntt_config.are_outputs_on_device = true;
|
||||
CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
|
||||
|
||||
CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaFreeAsync(MulGpu, ntt_config.ctx.stream));
|
||||
}
|
||||
|
||||
CHK_IF_RETURN(cudaEventRecord(stop, ntt_config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaEventElapsedTime(&measured_time, start, stop));
|
||||
|
||||
if (print) { std::cout << measured_time / iterations << " MS" << std::endl; }
|
||||
|
||||
return CHK_LAST();
|
||||
};
|
||||
|
||||
benchmark(false); // warmup
|
||||
benchmark(true, 20);
|
||||
|
||||
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
|
||||
|
||||
return 0;
|
||||
}
|
||||
3
examples/c++/polynomial_multiplication/run.sh
Executable file
3
examples/c++/polynomial_multiplication/run.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
./build/example 1 # radix2
|
||||
./build/example 0 # mixed-radix
|
||||
@@ -53,7 +53,7 @@ struct Args {
|
||||
lower_bound_log_size: u8,
|
||||
|
||||
/// Upper bound of MSM sizes to run for
|
||||
#[arg(short, long, default_value_t = 23)]
|
||||
#[arg(short, long, default_value_t = 22)]
|
||||
upper_bound_log_size: u8,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user