// icicle/examples/c++/multi-gpu-poseidon/example.cu

#include <iostream>
#include <thread>
#include <chrono>
#include <cstdint> // uint32_t
#include <cstdlib> // EXIT_FAILURE
#include <nvml.h>
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
void checkCudaError(cudaError_t error)
{
  if (error != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    // Handle the error, e.g., exit the program or throw an exception.
  }
}
// these global constants go into template calls
const int size_col = 11;
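
// Per-device worker: pins the calling thread to ctx.device_id, then hashes
// size_partition blocks of size_col scalars each, writing one Poseidon digest
// per block into column_hashes.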
void threadPoseidon(
  device_context::DeviceContext ctx,
  unsigned size_partition,
  scalar_t* layers,
  scalar_t* column_hashes,
  Poseidon<scalar_t>* poseidon)
{
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
  HashConfig column_config = default_hash_config(ctx);
  cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t)size_partition, size_col, 1, column_config);
  checkCudaError(err);
}
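
// Wall-clock timing helpers.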
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define CHECK_ALLOC(ptr) \
  if ((ptr) == nullptr) { \
    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
    exit(EXIT_FAILURE); \
  }
int main()
{
  const unsigned size_row = (1 << 30);
  const unsigned nof_partitions = 64;
  const unsigned size_partition = size_row / nof_partitions;
  // layers is allocated for a single partition only; it must be reused across partitions
  const uint32_t size_layers = size_col * size_partition;
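
  // Report the available GPUs and their total/free memory via NVML.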
  nvmlInit();
  unsigned int deviceCount;
  nvmlDeviceGetCount(&deviceCount);
  std::cout << "Available GPUs: " << deviceCount << std::endl;
  for (unsigned int i = 0; i < deviceCount; ++i) {
    nvmlDevice_t device;
    nvmlMemory_t memory;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    nvmlDeviceGetHandleByIndex(i, &device);
    nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    nvmlDeviceGetMemoryInfo(device, &memory);
    std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024
              << "/" << memory.free / 1024 / 1024 << std::endl;
  }
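
  // Per partition: size_col input scalars plus one hash output per row.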
  const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024;
  std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
  //===============================================================================
  // Key: multiple devices are supported by device context
  //===============================================================================
  device_context::DeviceContext ctx0 = device_context::get_default_device_context();
  ctx0.device_id = 0;
  device_context::DeviceContext ctx1 = device_context::get_default_device_context();
  ctx1.device_id = 1;
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col * size_partition; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col * size_partition; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
  scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash0);
  scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash1);
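
  // Construct one Poseidon instance per device; the active device must be
  // switched to GPU 1 before building its instance.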
  Poseidon<scalar_t> column_poseidon0(size_col, ctx0);
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return EXIT_FAILURE;
  }
  Poseidon<scalar_t> column_poseidon1(size_col, ctx1);
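
  // Hash the two partitions concurrently, one host thread per GPU.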
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_poseidon1);
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel, "2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
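
  // Baseline: hash both partitions back-to-back on GPU 0 for comparison.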
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_poseidon0);
thread3.join();
END_TIMER(sequential, "1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
  free(layers0);
  free(layers1);
  free(column_hash0);
  free(column_hash1);
  nvmlShutdown();
  return 0;
}