Mirror of https://github.com/zama-ai/concrete.git
- A new crate, concrete-cuda, is added to the repository. It contains CUDA implementations of the bootstrap and keyswitch, together with a Rust wrapper to call them.
- A new backend_cuda is added to concrete-core, with dedicated entities whose memory lives on the GPU and engines that call the CUDA-accelerated functions.
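The Rust wrapper can only call symbols with a C ABI, so the CUDA side has to expose plain extern "C" entry points around its kernels. Below is a minimal, hypothetical sketch of what such an entry point can look like; the kernel, function name, and signature are illustrative and not the actual concrete-cuda API:

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative kernel standing in for the real bootstrap/keyswitch kernels.
// Unsigned negation is modular, matching torus arithmetic on u64.
__global__ void negate_kernel(uint64_t *data, uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = -data[i];
}

// Hypothetical C-ABI entry point of the kind a Rust wrapper can declare in
// an extern "C" block and call through FFI. Name and parameters are
// assumptions made for this sketch.
extern "C" void cuda_negate_lwe_u64(void *v_stream, uint64_t *d_data,
                                    uint32_t n) {
  // The stream is passed as an opaque pointer across the FFI boundary.
  auto stream = static_cast<cudaStream_t>(v_stream);
  uint32_t threads = 256;
  uint32_t blocks = (n + threads - 1) / threads;
  negate_kernel<<<blocks, threads, 0, stream>>>(d_data, n);
}

On the Rust side, such a symbol would be declared in an extern "C" block and invoked through FFI. The header below provides the shared and device memory helpers used on the CUDA side.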
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H

#include "helper_cuda.h"
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>

// Bump allocator over a kernel's dynamically sized shared memory block: the
// kernel declares a single extern __shared__ buffer and carves typed
// sub-arrays out of it sequentially.
class SharedMemory {
public:
  char *m_memory_block;
  int m_last_byte;

  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  // Hands out the next elements * sizeof(T) bytes of the block. Offsets are
  // not padded, so callers should request allocations in decreasing
  // alignment order to keep each sub-array properly aligned.
  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};

// Tracks every cudaMalloc performed through it, together with the device the
// buffer lives on, so buffers can be released in bulk per device or all at
// once on destruction.
class DeviceMemory {
public:
  std::vector<std::tuple<void *, int>> m_allocated;
  std::mutex m_allocation_mtx;
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  // Records that device index `device` is in use, growing the device count
  // when a new index is seen (indices are 0-based, hence the >=).
  __host__ void set_device(int device) {
    if (static_cast<uint32_t>(device) >= m_total_devices)
      m_total_devices = device + 1;
  }

  // Allocates `elements` objects of type T on the current device and records
  // the buffer under `device`. The caller is expected to have selected
  // `device` with cudaSetDevice beforehand.
  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Same as get_allocation, but also schedules a host-to-device copy of
  // `elements` objects from `src` on the default stream.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Overload that allocates room for `allocation` objects but only copies the
  // first `elements` of them, for buffers that are filled incrementally.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * allocation));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Frees every buffer recorded against `device` and drops the corresponding
  // entries, so the destructor does not free the same pointer twice.
  void free_all_from_device(int device) {
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    cudaSetDevice(device);
    for (auto &elem : m_allocated) {
      if (std::get<1>(elem) == device)
        checkCudaErrors(cudaFree(std::get<0>(elem)));
    }
    m_allocated.erase(
        std::remove_if(m_allocated.begin(), m_allocated.end(),
                       [device](const std::tuple<void *, int> &elem) {
                         return std::get<1>(elem) == device;
                       }),
        m_allocated.end());
  }

  // Frees whatever is still registered, switching to each buffer's device
  // before releasing it.
  __host__ ~DeviceMemory() {
    for (auto &elem : m_allocated) {
      cudaSetDevice(std::get<1>(elem));
      checkCudaErrors(cudaFree(std::get<0>(elem)));
    }
  }
};

#endif // CNCRT_SHMEM_H
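As a usage illustration, here is a minimal sketch combining the two helpers above; the header file name shmem.cuh, the kernel, and the sizes are assumptions made for the example:

#include "shmem.cuh" // assumed name of the header above

__global__ void sum_pairs(double *out, int n) {
  // One dynamically sized shared buffer, split into two typed arrays.
  extern __shared__ char sharedmem[];
  SharedMemory shmem(sharedmem);
  double *lhs, *rhs;
  shmem.get_allocation(&lhs, n);
  shmem.get_allocation(&rhs, n);

  int i = threadIdx.x;
  if (i < n) {
    lhs[i] = i;
    rhs[i] = 2.0 * i;
    out[i] = lhs[i] + rhs[i];
  }
}

int main() {
  int n = 128;
  std::vector<double> host(n, 0.0);

  DeviceMemory dm;
  dm.set_device(0);
  cudaSetDevice(0);

  double *d_out;
  dm.get_allocation(&d_out, n, /*device=*/0);

  // Third launch parameter: the combined byte count of the two shared
  // arrays the kernel carves out with SharedMemory.
  sum_pairs<<<1, n, 2 * n * sizeof(double)>>>(d_out, n);
  checkCudaErrors(cudaMemcpy(host.data(), d_out, n * sizeof(double),
                             cudaMemcpyDeviceToHost));

  dm.free_all_from_device(0);
  return 0;
}

SharedMemory keeps all dynamic shared memory behind a single extern __shared__ declaration, which is why the launch passes one combined byte count rather than sizing each array separately.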