mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-10 16:07:59 -05:00
Temp/stas/muli gpu example (#381)
## Describe the changes This PR adds a Multi-GPU Poseidon example. ## Linked Issues Some minor on-device memory issues require attention from devs, please help
This commit is contained in:
25
examples/c++/multi-gpu-poseidon/CMakeLists.txt
Normal file
25
examples/c++/multi-gpu-poseidon/CMakeLists.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
cmake_minimum_required(VERSION 3.18)

# Host and device code are both built as C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)

# 'native' GPU-architecture detection exists only on CMake 3.24+;
# older versions fall back to the user-supplied CUDA_ARCH variable.
if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0")
  set(CMAKE_CUDA_ARCHITECTURES native)
else()
  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
endif()

project(icicle LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")

# change the path to your Icicle location
include_directories("../../../icicle")

add_executable(example example.cu)

# NVML (GPU enumeration) is linked from the CUDA toolkit stub library.
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/)
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
52
examples/c++/multi-gpu-poseidon/README.md
Normal file
52
examples/c++/multi-gpu-poseidon/README.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Icicle example: using multiple GPU to hash large dataset
|
||||
|
||||
## Best-Practices
|
||||
|
||||
This example builds on [single GPU Poseidon example](../poseidon/README.md) so we recommend to run it first.
|
||||
|
||||
## Key-Takeaway
|
||||
|
||||
Use `device_context::DeviceContext` variable to select GPU to use.
|
||||
Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
|
||||
|
||||
## Concise Usage Explanation
|
||||
|
||||
1. Include c++ threads
|
||||
|
||||
```c++
|
||||
#include <thread>
|
||||
```
|
||||
|
||||
2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
|
||||
|
||||
```c++
|
||||
void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
|
||||
```
|
||||
|
||||
3. Initialize device contexts for different GPUs
|
||||
|
||||
```c++
|
||||
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
|
||||
ctx0.device_id=0;
|
||||
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
|
||||
ctx1.device_id=1;
|
||||
```
|
||||
|
||||
4. Finally, spawn the threads and wait for their completion
|
||||
|
||||
```c++
|
||||
std::thread thread0(threadPoseidon, ctx0, ...);
|
||||
std::thread thread1(threadPoseidon, ctx1, ...);
|
||||
thread0.join();
|
||||
thread1.join();
|
||||
```
|
||||
|
||||
## What's in the example
|
||||
|
||||
This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes, one for each column of an $11 \times 2^{30}$ matrix.
|
||||
|
||||
1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
|
||||
2. Hash two partitions in parallel on two GPUs
|
||||
3. Hash two partitions in series on one GPU
|
||||
4. Compare execution times
|
||||
|
||||
9
examples/c++/multi-gpu-poseidon/compile.sh
Executable file
9
examples/c++/multi-gpu-poseidon/compile.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash

# Abort on the first failing command.
set -e

# Configure and build the example from a clean build directory.
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build
|
||||
148
examples/c++/multi-gpu-poseidon/example.cu
Normal file
148
examples/c++/multi-gpu-poseidon/example.cu
Normal file
@@ -0,0 +1,148 @@
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
|
||||
#include <nvml.h>
|
||||
|
||||
// select the curve
|
||||
#define CURVE_ID 2
|
||||
#include "appUtils/poseidon/poseidon.cu"
|
||||
#include "utils/error_handler.cuh"
|
||||
|
||||
using namespace poseidon;
|
||||
using namespace curve_config;
|
||||
|
||||
// Report a CUDA error (if any) to stderr.
// NOTE(review): execution deliberately continues after the message is
// printed; the caller decides whether/how to recover.
void checkCudaError(cudaError_t error) {
  if (error == cudaSuccess) return;
  std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
}
|
||||
|
||||
// these global constants go into template calls
|
||||
const int size_col = 11;
|
||||
|
||||
// this function executes the Poseidon thread
|
||||
// Worker body for one std::thread per GPU: binds the host thread to the
// device named in `ctx`, then Poseidon-hashes `size_partition` columns of
// `layers` into `column_hashes` using the precomputed `constants`.
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
  // Select the GPU this thread operates on.
  cudaError_t set_device_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (set_device_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(set_device_result) << std::endl;
    return;
  }
  // CHK_IF_RETURN(); I can't use it in a standard thread function
  // All flags false: inputs/outputs live in host memory, plain sync call.
  PoseidonConfig column_config = {
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // input_is_a_state
    false, // aligned
    false, // loop_state
    false, // is_async
  };
  cudaError_t hash_result =
    poseidon_hash<scalar_t, size_col + 1>(layers, column_hashes, (size_t)size_partition, *constants, column_config);
  checkCudaError(hash_result);
}
|
||||
|
||||
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
// Wall-clock timing helpers: START_TIMER(t) records a start point,
// END_TIMER(t, msg) prints the elapsed milliseconds labeled with msg.
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());


// Abort the process when a host allocation returned nullptr.
// BUGFIX (macro hygiene): wrapped in do { } while (0) so the expansion is a
// single statement — the original bare `if { }` form breaks inside an
// unbraced if/else at the call site.
#define CHECK_ALLOC(ptr) do { \
    if ((ptr) == nullptr) { \
      std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
      exit(EXIT_FAILURE); \
    } \
  } while (0)
|
||||
|
||||
int main() {
|
||||
const unsigned size_row = (1<<30);
|
||||
const unsigned nof_partitions = 64;
|
||||
const unsigned size_partition = size_row / nof_partitions;
|
||||
// layers is allocated only for one partition, need to reuse for different partitions
|
||||
const uint32_t size_layers = size_col * size_partition;
|
||||
|
||||
nvmlInit();
|
||||
unsigned int deviceCount;
|
||||
nvmlDeviceGetCount(&deviceCount);
|
||||
std::cout << "Available GPUs: " << deviceCount << std::endl;
|
||||
|
||||
for (unsigned int i = 0; i < deviceCount; ++i) {
|
||||
nvmlDevice_t device;
|
||||
nvmlMemory_t memory;
|
||||
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
|
||||
nvmlDeviceGetHandleByIndex(i, &device);
|
||||
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
|
||||
nvmlDeviceGetMemoryInfo(device, &memory);
|
||||
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
|
||||
}
|
||||
|
||||
const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
|
||||
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
|
||||
|
||||
//===============================================================================
|
||||
// Key: multiple devices are supported by device context
|
||||
//===============================================================================
|
||||
|
||||
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
|
||||
ctx0.device_id=0;
|
||||
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
|
||||
ctx1.device_id=1;
|
||||
|
||||
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
|
||||
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
|
||||
CHECK_ALLOC(layers0);
|
||||
scalar_t s = scalar_t::zero();
|
||||
for (unsigned i = 0; i < size_col*size_partition ; i++) {
|
||||
layers0[i] = s;
|
||||
s = s + scalar_t::one();
|
||||
}
|
||||
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
|
||||
CHECK_ALLOC(layers1);
|
||||
s = scalar_t::zero() + scalar_t::one();
|
||||
for (unsigned i = 0; i < size_col*size_partition ; i++) {
|
||||
layers1[i] = s;
|
||||
s = s + scalar_t::one();
|
||||
}
|
||||
|
||||
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
|
||||
CHECK_ALLOC(column_hash0);
|
||||
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
|
||||
CHECK_ALLOC(column_hash1);
|
||||
|
||||
PoseidonConstants<scalar_t> column_constants0, column_constants1;
|
||||
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
|
||||
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
|
||||
if (err_result != cudaSuccess) {
|
||||
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
|
||||
return;
|
||||
}
|
||||
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
|
||||
|
||||
std::cout << "Parallel execution of Poseidon threads" << std::endl;
|
||||
START_TIMER(parallel);
|
||||
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
|
||||
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
|
||||
|
||||
// Wait for the threads to finish
|
||||
thread0.join();
|
||||
thread1.join();
|
||||
END_TIMER(parallel,"2 GPUs");
|
||||
std::cout << "Output Data from Thread 0: ";
|
||||
std::cout << column_hash0[0] << std::endl;
|
||||
std::cout << "Output Data from Thread 1: ";
|
||||
std::cout << column_hash1[0] << std::endl;
|
||||
|
||||
std::cout << "Sequential execution of Poseidon threads" << std::endl;
|
||||
START_TIMER(sequential);
|
||||
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
|
||||
thread2.join();
|
||||
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
|
||||
thread3.join();
|
||||
END_TIMER(sequential,"1 GPU");
|
||||
std::cout << "Output Data from Thread 2: ";
|
||||
std::cout << column_hash0[0] << std::endl;
|
||||
std::cout << "Output Data from Thread 3: ";
|
||||
std::cout << column_hash1[0] << std::endl;
|
||||
|
||||
nvmlShutdown();
|
||||
return 0;
|
||||
}
|
||||
2
examples/c++/multi-gpu-poseidon/run.sh
Executable file
2
examples/c++/multi-gpu-poseidon/run.sh
Executable file
@@ -0,0 +1,2 @@
|
||||
#!/bin/bash
# Launch the example binary produced by compile.sh.
./build/example
|
||||
Reference in New Issue
Block a user