Files
concrete/src/utils/memory.cuh
Agnes Leroy 64521f6747 feat(cuda): introduce cuda acceleration for the pbs and keyswitch
- a new crate concrete-cuda is added to the repository, containing some
Cuda implementations for the bootstrap and keyswitch and a Rust wrapping
to call them
- a new backend_cuda is added to concrete-core, with dedicated entities
whose memory is located on the GPU and engines that call the Cuda
accelerated functions
2022-06-27 09:10:20 +02:00

91 lines
2.5 KiB
Plaintext

#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H
#include "helper_cuda.h"
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>
// Hands out consecutive typed slices of a caller-supplied byte buffer from
// device code.
//
// The object only keeps the buffer's start address and a running byte
// offset; it never allocates or frees anything itself.
// NOTE(review): the class name suggests the buffer is a __shared__ array,
// but nothing here requires that - confirm at the call sites.
class SharedMemory {
public:
  // First byte of the buffer being carved up.
  char *m_memory_block;
  // Byte offset (from m_memory_block) where the next slice will start.
  int m_last_byte;

  // Starts carving at the very beginning of `ptr`.
  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  // Stores in `*ptr` the address of the next free byte, reinterpreted as T*,
  // then advances the offset by `elements * sizeof(T)` bytes.
  //
  // No alignment padding is inserted and no capacity check is made: the
  // caller is responsible for requesting slices in an order that keeps each
  // type suitably aligned and for staying within the buffer.
  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    char *slice_start = m_memory_block + m_last_byte;
    *ptr = reinterpret_cast<T *>(slice_start);
    m_last_byte += elements * sizeof(T);
  }
};
// Host-side bookkeeping for device buffers obtained through cudaMalloc.
//
// Every buffer handed out by one of the get_allocation* members is recorded
// together with the device index supplied by the caller, so that it can be
// released later either per device (free_all_from_device) or all at once when
// the object is destroyed.
class DeviceMemory {
public:
  // (device pointer, device index) for every buffer handed out and not yet
  // freed.
  std::vector<std::tuple<void *, int>> m_allocated;
  // Guards m_allocated.
  std::mutex m_allocation_mtx;
  // One more than the highest device index seen by set_device; starts at 1.
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  // Raises m_total_devices to `device + 1` if it is currently lower.
  //
  // Negative indices are ignored: they cannot name a device and would wrap
  // around when converted to the unsigned counter.
  __host__ void set_device(int device) {
    if (device < 0)
      return;
    const uint32_t wanted = static_cast<uint32_t>(device) + 1;
    uint32_t seen = m_total_devices.load();
    // Compare-and-swap so that concurrent callers can only ever increase the
    // counter; a failed exchange refreshes `seen` with the current value.
    while (seen < wanted &&
           !m_total_devices.compare_exchange_weak(seen, wanted)) {
    }
  }

  // Allocates room for `elements` objects of type T and returns the device
  // pointer through `*ptr`.
  //
  // The allocation is made on whichever device is current for the calling
  // thread; `device` is only recorded for later cleanup, so the caller must
  // have selected that device beforehand. A failing cudaMalloc is reported
  // through checkCudaErrors instead of recording an invalid pointer.
  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res = nullptr;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    *ptr = res;
    track_allocation(res, device);
  }

  // Same as get_allocation, then enqueues a host-to-device copy of `elements`
  // objects from `src` into the new buffer.
  //
  // No stream argument is passed to cudaMemcpyAsync, so the copy is issued on
  // the default stream. The call does not wait for the copy here.
  // NOTE(review): `src` presumably has to stay valid until that copy has
  // completed - confirm how callers synchronise.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res = nullptr;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    track_allocation(res, device);
  }

  // Variant that allocates room for `allocation` objects but copies only the
  // first `elements` objects from `src`.
  //
  // The caller must ensure `elements <= allocation`; this is not checked, and
  // the remainder of the buffer is left uninitialised.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res = nullptr;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * allocation));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    track_allocation(res, device);
  }

  // Frees every buffer recorded for `device` and drops those entries from
  // m_allocated, so that neither a second call nor the destructor frees them
  // again.
  //
  // Side effect: `device` becomes the calling thread's current device.
  void free_all_from_device(int device) {
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    checkCudaErrors(cudaSetDevice(device));
    // Entries belonging to other devices are carried over unchanged.
    std::vector<std::tuple<void *, int>> kept;
    kept.reserve(m_allocated.size());
    for (const auto &entry : m_allocated) {
      if (std::get<1>(entry) == device) {
        checkCudaErrors(cudaFree(std::get<0>(entry)));
      } else {
        kept.push_back(entry);
      }
    }
    m_allocated.swap(kept);
  }

  // Frees every buffer that is still recorded, switching to the recorded
  // device before each cudaFree.
  __host__ ~DeviceMemory() {
    for (const auto &entry : m_allocated) {
      checkCudaErrors(cudaSetDevice(std::get<1>(entry)));
      checkCudaErrors(cudaFree(std::get<0>(entry)));
    }
  }

private:
  // Records `mem` as belonging to `device`, under the allocation mutex.
  __host__ void track_allocation(void *mem, int device) {
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(mem, device));
  }
};
#endif // CNCRT_SHMEM_H