Mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-17 08:01:20 -05:00)
feat(cuda): introduce cuda acceleration for the pbs and keyswitch
- a new crate, concrete-cuda, is added to the repository, containing CUDA implementations of the bootstrap and keyswitch and a Rust wrapper to call them
- a new backend_cuda is added to concrete-core, with dedicated entities whose memory lives on the GPU and engines that call the CUDA-accelerated functions
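The bridge between the two sides is a C ABI: the CUDA code exposes extern "C" entry points that the Rust wrapper binds against through FFI. A minimal sketch of that shape, assuming a keyswitch entry point; the function name and parameter list below are illustrative, not the crate's actual API:

#include <cstdint>

// Illustrative only: the kind of C-compatible symbol the Rust side can
// declare in an extern block and call. Real concrete-cuda signatures differ.
extern "C" void cuda_keyswitch_lwe_ciphertext_vector(
    void *v_stream,        // opaque cudaStream_t handle
    void *lwe_array_out,   // device pointer: output ciphertexts
    void *lwe_array_in,    // device pointer: input ciphertexts
    void *ksk,             // device pointer: keyswitch key
    uint32_t num_samples); // number of ciphertexts to process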
src/utils/kernel_dimensions.cuh (Normal file, 15 lines)
@@ -0,0 +1,15 @@
// Round x up to the next power of two by smearing the high bit into
// every lower bit, then adding one.
int nextPow2(int x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

// Choose a launch configuration for n elements: for small inputs, use the
// smallest power of two covering half of n (each thread can handle two
// elements); otherwise cap at maxBlockSize. blocks is ceil(n / threads).
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
                            int &threads) {
  threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
  blocks = (n + threads - 1) / threads;
}
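A quick host-side check of the heuristic, as a usage sketch (the sizes here are illustrative, not from the commit): for n = 1000 and maxBlockSize = 512, n is below 2 * maxBlockSize, so threads = nextPow2(500) = 512 and blocks = ceil(1000 / 512) = 2.

#include "kernel_dimensions.cuh"
#include <cassert>

int main() {
  int blocks, threads;
  // Small input: threads is the power of two covering half the elements.
  getNumBlocksAndThreads(1000, 512, blocks, threads);
  assert(threads == 512 && blocks == 2);
  // Large input: threads is capped at maxBlockSize.
  getNumBlocksAndThreads(1 << 20, 512, blocks, threads);
  assert(threads == 512 && blocks == 2048);
  return 0;
}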
src/utils/memory.cuh (Normal file, 90 lines)
@@ -0,0 +1,90 @@
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H

#include "helper_cuda.h"
#include <atomic>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>

// Bump allocator over a kernel's dynamic shared memory block: each
// get_allocation call hands out the next `elements * sizeof(T)` bytes.
// No alignment padding is inserted, so request larger-aligned types first.
class SharedMemory {
public:
  char *m_memory_block;
  int m_last_byte;

  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};

// Tracks device allocations as (pointer, device) pairs so they can be
// freed in bulk, either per device or when the tracker is destroyed.
// The caller is expected to have selected the target device before
// calling the allocation methods.
class DeviceMemory {
public:
  std::vector<std::tuple<void *, int>> m_allocated;
  std::mutex m_allocation_mtx;
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  __host__ void set_device(int device) {
    // Device indices are zero-based, so device i implies i + 1 devices.
    if (device >= m_total_devices)
      m_total_devices = device + 1;
  }

  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Allocate and schedule a host-to-device copy on the default stream.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    checkCudaErrors(
        cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Overload that allocates more elements than it copies, leaving
  // headroom in the buffer for later writes.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * allocation));
    checkCudaErrors(
        cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Free every allocation recorded for `device`, erasing the entries so
  // the destructor does not free them a second time.
  void free_all_from_device(int device) {
    cudaSetDevice(device);
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    for (auto it = m_allocated.begin(); it != m_allocated.end();) {
      if (std::get<1>(*it) == device) {
        checkCudaErrors(cudaFree(std::get<0>(*it)));
        it = m_allocated.erase(it);
      } else {
        ++it;
      }
    }
  }

  __host__ ~DeviceMemory() {
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      auto mem = std::get<0>(elem);
      cudaSetDevice(dev);
      checkCudaErrors(cudaFree(mem));
    }
  }
};

#endif // CNCRT_SHMEM_H
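A usage sketch tying the two classes together (example_kernel and launch_example are hypothetical, not part of the commit): the kernel carves typed slices out of one dynamic shared memory block, and the host tracks a device buffer whose lifetime ends with the DeviceMemory instance. Because get_allocation adds no alignment padding, the double slice is requested before the int slice.

#include "memory.cuh"

// Hypothetical kernel: suballocate two typed arrays from one shared block.
__global__ void example_kernel(double *out, int n) {
  extern __shared__ char shmem[];
  SharedMemory pool(shmem);
  double *acc;
  int *indices;
  pool.get_allocation(&acc, n);     // n doubles at offset 0
  pool.get_allocation(&indices, n); // n ints immediately after
  int tid = threadIdx.x;
  if (tid < n) {
    indices[tid] = tid;
    acc[tid] = 2.0 * indices[tid];
    out[tid] = acc[tid];
  }
}

// Hypothetical host wrapper; assumes n fits in a single thread block.
void launch_example(int n) {
  DeviceMemory dmem;
  double *d_out;
  dmem.get_allocation(&d_out, n, /*device=*/0);
  size_t shmem_bytes = n * (sizeof(double) + sizeof(int));
  example_kernel<<<1, n, shmem_bytes>>>(d_out, n);
  cudaDeviceSynchronize();
} // ~DeviceMemory frees d_out here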
src/utils/timer.cuh (Normal file, 29 lines)
@@ -0,0 +1,29 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H

#include <iostream>

#define synchronize_threads_in_block() __syncthreads()

// RAII scope timer built on CUDA events: construction records a start
// event, destruction records a stop event, waits on it, and prints the
// elapsed time. With active == false everything compiles away.
template <bool active> class CudaMeasureExecution {
public:
  cudaEvent_t m_start, m_stop;

  __host__ CudaMeasureExecution() {
    if constexpr (active) {
      cudaEventCreate(&m_start);
      cudaEventCreate(&m_stop);
      cudaEventRecord(m_start);
    }
  }

  __host__ ~CudaMeasureExecution() {
    if constexpr (active) {
      float ms;
      cudaEventRecord(m_stop);
      cudaEventSynchronize(m_stop);
      cudaEventElapsedTime(&ms, m_start, m_stop);
      std::cout << "Execution took " << ms << "ms" << std::endl;
      // Release the events once the measurement has been read back.
      cudaEventDestroy(m_start);
      cudaEventDestroy(m_stop);
    }
  }
};

#endif // CNCRT_TIMER_H
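CudaMeasureExecution is meant to be used as a scope guard around the work being timed, with the boolean template parameter switching the measurement on or off at compile time. A small sketch (dummy_kernel is a placeholder, not from the commit):

#include "timer.cuh"

__global__ void dummy_kernel() { /* placeholder work */ }

int main() {
  {
    CudaMeasureExecution<true> timer; // start event recorded here
    dummy_kernel<<<1, 32>>>();
  } // stop event recorded and synchronized; elapsed ms printed
  {
    CudaMeasureExecution<false> off; // disabled: no events, no output
    dummy_kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
  }
  return 0;
}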