feat(cuda): introduce CUDA acceleration for the PBS and keyswitch

- a new crate, concrete-cuda, is added to the repository, containing
CUDA implementations of the bootstrap and keyswitch, together with a
Rust wrapper to call them (the FFI pattern is sketched below)
- a new backend_cuda is added to concrete-core, with dedicated entities
whose memory lives on the GPU, and engines that call the
CUDA-accelerated functions
Author: Agnès Leroy
Date: 2021-10-20 09:34:57 +02:00
Commit: 64521f6747
37 changed files with 15143 additions and 0 deletions
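
For context on the first bullet point: the Rust wrapper reaches the CUDA code through a plain C ABI. The sketch below illustrates that pattern only; the entry point, kernel, and parameters are invented for the example and are not the actual concrete-cuda API.

#include <cstdint>

// Hypothetical placeholder kernel, for illustration only.
__global__ void example_keyswitch_kernel(uint64_t *out, const uint64_t *in) {
  out[threadIdx.x] = in[threadIdx.x];
}

// An extern "C" shim gives the function an unmangled symbol that the
// Rust side can declare and call through FFI.
extern "C" void cuda_example_keyswitch(void *v_stream, uint64_t *lwe_out,
                                       const uint64_t *lwe_in) {
  auto stream = static_cast<cudaStream_t>(v_stream);
  example_keyswitch_kernel<<<1, 256, 0, stream>>>(lwe_out, lwe_in);
}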


@@ -0,0 +1,15 @@
// Round x up to the next power of two (returns x unchanged if it is
// already a power of two).
int nextPow2(int x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

// Compute a launch configuration for a reduction over n elements: each
// thread handles two elements, and the block size is capped at
// maxBlockSize.
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
                            int &threads) {
  threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
  blocks = (n + threads - 1) / threads;
}
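
As a quick check of the helper above, the launch configuration for a hypothetical reduction over 1000 elements with a 512-thread block cap works out as follows:

int blocks, threads;
// n = 1000 < 2 * 512, so threads = nextPow2((1000 + 1) / 2)
//                                = nextPow2(500) = 512,
// and blocks = (1000 + 512 - 1) / 512 = 2.
getNumBlocksAndThreads(1000, 512, blocks, threads);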

src/utils/memory.cuh

@@ -0,0 +1,90 @@
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H
#include "helper_cuda.h"
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>
// Carves typed sub-allocations out of a single dynamic shared memory
// block passed at kernel launch. Offsets are not re-aligned, so request
// the most strictly aligned types first.
class SharedMemory {
public:
  char *m_memory_block;
  int m_last_byte;

  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  // Hand out the next `elements * sizeof(T)` bytes of the block as a T*.
  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};
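
// --- Illustrative sketch, not part of the original commit: a kernel
// --- typically uses SharedMemory like this to slice one dynamic shared
// --- memory block into typed buffers (names here are invented).
extern __shared__ char sharedmem[];

__global__ void example_shared_memory_user(double *data, int n) {
  SharedMemory pool(sharedmem);
  double *buffer; // first n * sizeof(double) bytes (strictest alignment first)
  int *flags;     // next n * sizeof(int) bytes
  pool.get_allocation(&buffer, n);
  pool.get_allocation(&flags, n);
  buffer[threadIdx.x] = data[threadIdx.x];
  flags[threadIdx.x] = 1;
  __syncthreads();
  data[threadIdx.x] = buffer[threadIdx.x] * flags[threadIdx.x];
}
// --- Launched with the combined size as the dynamic shared memory arg:
// --- example_shared_memory_user<<<1, n, n * (sizeof(double) + sizeof(int))>>>
// --- (d_data, n); end of illustrative sketch. ---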
// Tracks device allocations so they can be freed in bulk, per device or
// at destruction time.
class DeviceMemory {
public:
  std::vector<std::tuple<void *, int>> m_allocated;
  std::mutex m_allocation_mtx;
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  // Record the highest device index seen; indices are zero-based, hence
  // the >= comparison (`>` would miss device m_total_devices itself).
  __host__ void set_device(int device) {
    if (device >= (int)m_total_devices)
      m_total_devices = device + 1;
  }
  // Allocate `elements` items of type T on the GPU and record the
  // allocation for later cleanup; `device` is only used for bookkeeping,
  // the caller is expected to have selected the device beforehand.
  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Allocate and asynchronously copy `elements` items from host to device
  // on the default stream.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Variant that allocates room for `allocation` items but only copies
  // `elements` of them, for buffers larger than their initial contents.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * allocation));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }
  // Free all allocations made on `device` and drop them from the tracking
  // list, so the destructor does not free them a second time.
  void free_all_from_device(int device) {
    cudaSetDevice(device);
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    for (auto it = m_allocated.begin(); it != m_allocated.end();) {
      if (std::get<1>(*it) == device) {
        checkCudaErrors(cudaFree(std::get<0>(*it)));
        it = m_allocated.erase(it);
      } else {
        ++it;
      }
    }
  }
  // Free whatever is still tracked when the instance is destroyed.
  __host__ ~DeviceMemory() {
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      auto mem = std::get<0>(elem);
      cudaSetDevice(dev);
      checkCudaErrors(cudaFree(mem));
    }
  }
};
#endif // CNCRT_SHMEM_H
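
A short host-side usage sketch for DeviceMemory; the buffer names and sizes below are invented for the example:

#include "memory.cuh"

DeviceMemory device_memory;

void example_host_usage(uint64_t *h_input, int elements) {
  uint64_t *d_input, *d_output;
  // Copy the input to device 0 and allocate an output buffer there.
  device_memory.get_allocation_and_copy_async(&d_input, h_input, elements, 0);
  device_memory.get_allocation(&d_output, elements, 0);
  // ... launch kernels reading d_input and writing d_output ...
  cudaDeviceSynchronize();
  // Release everything that was allocated on device 0.
  device_memory.free_all_from_device(0);
}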

src/utils/timer.cuh

@@ -0,0 +1,29 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H
#include <iostream>

#define synchronize_threads_in_block() __syncthreads()
// RAII timer: when `active` is true, measures the wall-clock time between
// construction and destruction with CUDA events and prints it.
template <bool active> class CudaMeasureExecution {
public:
  cudaEvent_t m_start, m_stop;

  __host__ CudaMeasureExecution() {
    if constexpr (active) {
      cudaEventCreate(&m_start);
      cudaEventCreate(&m_stop);
      cudaEventRecord(m_start);
    }
  }
  __host__ ~CudaMeasureExecution() {
    if constexpr (active) {
      float ms;
      cudaEventRecord(m_stop);
      cudaEventSynchronize(m_stop);
      cudaEventElapsedTime(&ms, m_start, m_stop);
      std::cout << "Execution took " << ms << "ms" << std::endl;
      // Release the events to avoid leaking them across many timings.
      cudaEventDestroy(m_start);
      cudaEventDestroy(m_stop);
    }
  }
};
#endif // CNCRT_TIMER_H
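
Usage is scope-based; a minimal sketch, with a stand-in kernel:

__global__ void noop_kernel() {}

void timed_launch() {
  // Everything between construction and the end of the scope is timed.
  CudaMeasureExecution<true> timer;
  noop_kernel<<<1, 1>>>();
} // the destructor records m_stop, synchronizes, and prints the elapsed time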