#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H

#include "helper_cuda.h"
#include <atomic>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>

/**
 * Device-side bump allocator over a caller-provided byte buffer.
 *
 * Each call to get_allocation() hands out the next `elements * sizeof(T)`
 * bytes of the buffer and advances an internal byte offset. Nothing is ever
 * released, and no bounds check is performed against the size of the buffer.
 *
 * NOTE(review): presumably `ptr` is the kernel's shared-memory block -- the
 * class itself only sees a `char *`; confirm against the call sites.
 */
class SharedMemory {
public:
  /// Start of the buffer that allocations are carved from.
  char *m_memory_block;
  /// Offset, in bytes, of the first byte that has not been handed out yet.
  int m_last_byte;

  /// Wraps `ptr` with the allocation offset starting at zero.
  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  /**
   * Reserves room for `elements` objects of type T.
   *
   * @param ptr      receives the address of the reserved region
   * @param elements number of T objects to reserve
   *
   * No alignment padding is inserted: the returned address is simply the
   * current offset, so callers mixing element types must order their requests
   * such that every region ends up suitably aligned for its type.
   */
  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};

/**
 * Host-side registry of device allocations.
 *
 * Every buffer obtained through this class is recorded together with the
 * device index supplied by the caller, so that it can be released later either
 * per device (free_all_from_device) or all at once (destructor).
 *
 * The allocation functions do NOT call cudaSetDevice(): the buffer is created
 * on whichever device is current in the calling thread, and `device` is only
 * the label stored in the registry. Callers are expected to keep both in sync.
 */
class DeviceMemory {
public:
  /// (device pointer, device index) for every buffer that is still tracked.
  std::vector<std::tuple<void *, int>> m_allocated;
  /// Serialises modifications of m_allocated.
  std::mutex m_allocation_mtx;
  /// Highest device index announced through set_device(), plus one (>= 1).
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  /**
   * Announces that `device` is in use, growing the device count to
   * `device + 1` when it is currently smaller. The count never shrinks.
   * Negative indices are ignored.
   */
  __host__ void set_device(int device) {
    if (device < 0)
      return;

    const uint32_t required = static_cast<uint32_t>(device) + 1u;
    uint32_t current = m_total_devices.load();
    // Compare-and-swap instead of "check, then store", so that a concurrent
    // call announcing a higher index can never be overwritten by a lower one.
    while (current < required &&
           !m_total_devices.compare_exchange_weak(current, required)) {
      // `current` was refreshed by the failed exchange; re-evaluate.
    }
  }

  /**
   * Allocates an uninitialised device buffer of `elements` objects of type T.
   *
   * @param ptr      receives the device pointer
   * @param elements number of T objects to allocate
   * @param device   device index recorded alongside the buffer
   */
  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *buffer = nullptr;
    checkCudaErrors(cudaMalloc((void **)&buffer, sizeof(T) * elements));
    *ptr = buffer;
    track(buffer, device);
  }

  /**
   * Allocates a device buffer of `elements` objects of type T and enqueues a
   * host-to-device copy of `elements` objects from `src` into it.
   *
   * The copy is issued with cudaMemcpyAsync and no stream argument, so `src`
   * must stay valid until that copy has completed.
   *
   * @param ptr      receives the device pointer
   * @param src      host source buffer holding at least `elements` objects
   * @param elements number of T objects to allocate and copy
   * @param device   device index recorded alongside the buffer
   */
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    get_allocation_and_copy_async(ptr, src, elements, elements, device);
  }

  /**
   * Same as the overload above, but the buffer is sized for `allocation`
   * objects while only the first `elements` objects are copied from `src`.
   * The remainder of the buffer is left uninitialised.
   *
   * Precondition: `elements <= allocation`; this is not checked.
   *
   * @param ptr        receives the device pointer
   * @param src        host source buffer holding at least `elements` objects
   * @param allocation number of T objects to allocate
   * @param elements   number of T objects to copy
   * @param device     device index recorded alongside the buffer
   */
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *buffer = nullptr;
    checkCudaErrors(cudaMalloc((void **)&buffer, sizeof(T) * allocation));
    checkCudaErrors(cudaMemcpyAsync(buffer, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = buffer;
    track(buffer, device);
  }

  /**
   * Frees every tracked buffer that was recorded for `device` and removes it
   * from the registry, so that the destructor does not free it a second time.
   * Makes `device` the current device of the calling thread; the result of
   * that cudaSetDevice call is not checked.
   */
  void free_all_from_device(int device) {
    cudaSetDevice(device);

    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    std::vector<std::tuple<void *, int>> kept;
    kept.reserve(m_allocated.size());
    for (const auto &entry : m_allocated) {
      if (std::get<1>(entry) == device) {
        checkCudaErrors(cudaFree(std::get<0>(entry)));
      } else {
        kept.push_back(entry);
      }
    }
    m_allocated.swap(kept);
  }

  /**
   * Frees every buffer that is still tracked, switching the current device to
   * the one recorded for each buffer first (the result of that cudaSetDevice
   * call is not checked).
   */
  __host__ ~DeviceMemory() {
    for (const auto &entry : m_allocated) {
      cudaSetDevice(std::get<1>(entry));
      checkCudaErrors(cudaFree(std::get<0>(entry)));
    }
  }

private:
  /// Records `mem` as belonging to `device`, under the allocation mutex.
  __host__ void track(void *mem, int device) {
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(mem, device));
  }
};

#endif // CNCRT_SHMEM_H