Stas/best-practice-ntt (#517)

## Describe the changes

Icicle examples: Concurrent Data Transfer and NTT Computation

This PR introduces a Best Practices series of examples in C++.
Specifically, the example shows how to concurrently transfer data
to/from the device while executing NTTs on the GPU.

## Linked Issues

Resolves #
Stas committed 2024-05-16 14:51:49 -06:00 (committed by GitHub)
parent 4496520a10
commit 02059fcfaa
5 changed files with 216 additions and 0 deletions

View File: CMakeLists.txt

@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
  set(CMAKE_CUDA_ARCHITECTURES native) # "native" requires CMake 3.24+; on earlier versions it is ignored and no architecture is passed, hence the CUDA_ARCH branch above
endif()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
  example
  example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File: README.md

@@ -0,0 +1,33 @@
# ICICLE best practices: Concurrent Data Transfer and NTT Computation
The [Number Theoretic Transform (NTT)](https://dev.ingonyama.com/icicle/primitives/ntt) is an integral component of many cryptographic algorithms, such as polynomial multiplication in Zero-Knowledge Proofs. The performance bottleneck of NTT on GPUs is the data transfer between the host (CPU) and the device (GPU): on a typical NVIDIA GPU this transfer dominates the total NTT execution time.
## Key-Takeaway
When you have to run several NTTs, consider overlapping data download, upload, and computation to improve data bus (PCIe) and GPU utilization and reduce the total execution time (a rough timing model follows the note below).
Typically, you concurrently
1. Download the output of the previous NTT back to the host
2. Upload the input of the next NTT to the device
3. Run the current NTT
> [!NOTE]
> This approach requires two on-device vectors, halving the maximum NTT size that fits in device memory.
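
As a rough model for intuition (a back-of-the-envelope assumption, not a measurement from this example): if one batched NTT takes $T_{\text{NTT}}$ on the device and its input and output transfers take $T_{\text{H2D}}$ and $T_{\text{D2H}}$, then per NTT

```math
T_{\text{sequential}} \approx T_{\text{H2D}} + T_{\text{NTT}} + T_{\text{D2H}}
\qquad\text{vs.}\qquad
T_{\text{overlapped}} \approx \max\!\left(T_{\text{H2D}},\, T_{\text{NTT}},\, T_{\text{D2H}}\right)
```

in steady state, assuming the GPU can execute a kernel and both copy directions concurrently (separate copy engines).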
## Best-Practices
1. Use three separate CUDA streams for the Download, Upload, and Compute operations (see the sketch below).
2. Use pinned (page-locked) host memory to speed up data bus transfers; `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save device memory.
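
Below is a minimal, self-contained sketch of these three practices. It is not the ICICLE example itself: `dummy_compute`, the buffer sizes, and the iteration count are hypothetical placeholders standing in for the batched NTT and its parameters.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical stand-in for the NTT kernel: any sufficiently long kernel
// illustrates the overlap; it is not part of the ICICLE API.
__global__ void dummy_compute(float* data, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) data[i] = data[i] * 2.0f + 1.0f;
}

int main() {
  const size_t n = 1 << 22; // placeholder problem size
  const size_t bytes = n * sizeof(float);

  // 1. Three streams: one per transfer direction, one for compute
  cudaStream_t s_compute, s_h2d, s_d2h;
  cudaStreamCreate(&s_compute);
  cudaStreamCreate(&s_h2d);
  cudaStreamCreate(&s_d2h);

  // 2. Pinned host buffers (cudaHostAlloc), so async copies can overlap with kernels
  // 3. Two device buffers, so one is computed on in place while the other is transferred
  float *h_in[2], *h_out[2], *d_buf[2];
  for (int i = 0; i < 2; i++) {
    cudaHostAlloc((void**)&h_in[i], bytes, cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_out[i], bytes, cudaHostAllocDefault);
    cudaMalloc((void**)&d_buf[i], bytes);
    for (size_t j = 0; j < n; j++) h_in[i][j] = (float)j;
  }

  // Prime the pipeline: upload the first input before the loop starts
  cudaMemcpyAsync(d_buf[0], h_in[0], bytes, cudaMemcpyHostToDevice, s_h2d);
  cudaStreamSynchronize(s_h2d);

  for (int run = 0; run < 8; run++) {
    int c = run % 2;       // buffer being computed on
    int t = (run + 1) % 2; // buffer being drained and refilled
    dummy_compute<<<(unsigned)((n + 255) / 256), 256, 0, s_compute>>>(d_buf[c], n);
    if (run > 0) { // d_buf[t] holds a finished result only after the first run
      cudaMemcpyAsync(h_out[t], d_buf[t], bytes, cudaMemcpyDeviceToHost, s_d2h);
    }
    cudaStreamSynchronize(s_d2h); // read d_buf[t] back before overwriting it (write-after-read)
    cudaMemcpyAsync(d_buf[t], h_in[t], bytes, cudaMemcpyHostToDevice, s_h2d);
    cudaStreamSynchronize(s_h2d);
    cudaStreamSynchronize(s_compute);
  }
  // (for brevity, the result of the final run is never downloaded)

  for (int i = 0; i < 2; i++) {
    cudaFree(d_buf[i]);
    cudaFreeHost(h_in[i]);
    cudaFreeHost(h_out[i]);
  }
  cudaStreamDestroy(s_compute);
  cudaStreamDestroy(s_h2d);
  cudaStreamDestroy(s_d2h);
  printf("done\n");
  return 0;
}
```

For simplicity this sketch serializes the download and upload within each iteration; the full `example.cu` instead splits both transfers into blocks and offsets the upload by one block, so the two copy directions also overlap each other.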
## Running the example
The default curve is BN254; to change it, edit `compile.sh` and `CMakeLists.txt`. To build and run the example:
```sh
./compile.sh
./run.sh
```
To compare with the ICICLE baseline (i.e. non-concurrent) NTT, you can run [this example](../ntt/README.md).

View File: compile.sh

@@ -0,0 +1,16 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File: example.cu

@@ -0,0 +1,142 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;
const std::string curve = "BN254";
typedef scalar_t S;
typedef scalar_t E;
const unsigned max_log_ntt_size = 27;
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements) {
  for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
    elements[i] = E::from(i + 1);
  }
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv) {
  cudaDeviceReset();
  cudaDeviceProp deviceProperties;
  int deviceId = 0;
  cudaGetDeviceProperties(&deviceProperties, deviceId);
  std::string gpu_full_name = deviceProperties.name;
  std::cout << "GPU: " << gpu_full_name << std::endl;
  std::cout << "Curve: " << curve << std::endl;
  S basic_root = S::omega(max_log_ntt_size);
  // change these parameters to match the desired NTT size and batch size
  const unsigned log_ntt_size = 22;
  const unsigned nof_ntts = 16;
  std::cout << "log NTT size: " << log_ntt_size << std::endl;
  const unsigned ntt_size = 1 << log_ntt_size;
  std::cout << "Batch size: " << nof_ntts << std::endl;
  // Create separate CUDA streams for overlapping data transfers and kernel execution.
  cudaStream_t stream_compute, stream_h2d, stream_d2h;
  cudaStreamCreate(&stream_compute);
  cudaStreamCreate(&stream_h2d);
  cudaStreamCreate(&stream_d2h);
  // Create device context for NTT computation
  auto ctx_compute = device_context::DeviceContext{
    stream_compute, // stream
    0,              // device_id
    0,              // mempool
  };
  // Initialize NTT domain and configuration
  bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
  NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
  config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
  config_compute.batch_size = nof_ntts;
  config_compute.are_inputs_on_device = true;
  config_compute.are_outputs_on_device = true;
  config_compute.is_async = true;
  std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
  // split each transfer into blocks so that download and upload can be interleaved
  int nof_blocks = 32;
  std::cout << "Number of blocks: " << nof_blocks << std::endl;
  int block_size = ntt_size * nof_ntts / nof_blocks;
  // on-host pinned data
  E* h_inp[2];
  E* h_out[2];
  for (int i = 0; i < 2; i++) {
    cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
  }
  // on-device in-place data
  // we need two on-device vectors to overlap data transfers with NTT kernel execution
  E* d_vec[2];
  for (int i = 0; i < 2; i++) {
    cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
  }
  // initialize input data
  initialize_input(ntt_size, nof_ntts, h_inp[0]);
  initialize_input(ntt_size, nof_ntts, h_inp[1]);
  cudaEvent_t compute_start, compute_stop;
  cudaEventCreate(&compute_start);
  cudaEventCreate(&compute_stop);
  for (int run = 0; run < 10; run++) {
    // alternate the roles of the two device vectors between compute and transfer
    int vec_compute = run % 2;
    int vec_transfer = (run + 1) % 2;
    std::cout << "Run: " << run << std::endl;
    std::cout << "Compute Vector: " << vec_compute << std::endl;
    std::cout << "Transfer Vector: " << vec_transfer << std::endl;
    START_TIMER(inplace);
    cudaEventRecord(compute_start, stream_compute);
    bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
    cudaEventRecord(compute_stop, stream_compute);
    // delay the upload to the device by one block relative to the download from the device:
    // each block must be read back before it is overwritten (write-after-read)
    for (int i = 0; i <= nof_blocks; i++) {
      if (i < nof_blocks) {
        cudaMemcpyAsync(&h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size, cudaMemcpyDeviceToHost, stream_d2h);
      }
      if (i > 0) {
        cudaMemcpyAsync(&d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size], sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
      }
      // synchronize upload and download at the end of each block to ensure data integrity
      cudaStreamSynchronize(stream_d2h);
      cudaStreamSynchronize(stream_h2d);
    }
    // wait for the compute stream to finish the NTT
    cudaEventSynchronize(compute_stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
    END_TIMER(inplace, "Concurrent In-Place NTT");
    std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
  }
  // Clean-up
  for (int i = 0; i < 2; i++) {
    cudaFree(d_vec[i]);
    cudaFreeHost(h_inp[i]);
    cudaFreeHost(h_out[i]);
  }
  cudaEventDestroy(compute_start);
  cudaEventDestroy(compute_stop);
  cudaStreamDestroy(stream_compute);
  cudaStreamDestroy(stream_d2h);
  cudaStreamDestroy(stream_h2d);
  return 0;
}

View File: run.sh

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example