diff --git a/Makefile b/Makefile
index 132f58b1f..655979048 100644
--- a/Makefile
+++ b/Makefile
@@ -994,6 +994,19 @@ test_high_level_api: install_rs_build_toolchain
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::
 
+test_high_level_api_gpu_one: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
+		--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
+		-- --nocapture high_level_api::array::tests::booleans::test_gpu_only_bitand
+
+#
+test_high_level_api_gpu_mul: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
+		--features=integer,internal-keycache,gpu-debug --profile release \
+		-p tfhe \
+		-- --nocapture integer::gpu::server_key::radix::tests_unsigned::test_mul:: \
+		--test-threads=6
+
 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index 908cfb717..95e65dc71 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #define CUDA_STREAM_POOL
 
@@ -83,13 +85,27 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
 uint32_t cuda_is_available();
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
+void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);
 
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory);
+void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                  cudaStream_t stream,
+                                                  uint32_t gpu_index,
+                                                  uint64_t &size_tracker,
+                                                  bool allocate_gpu_memory,
+                                                  const char *file, int line);
 
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+#define cuda_malloc_with_size_tracking_async(                                 \
+    size, stream, gpu_index, size_tracker, allocate_gpu_memory)               \
+  cuda_intern_malloc_with_size_tracking_async(                                \
+      size, stream, gpu_index, size_tracker, allocate_gpu_memory, __FILE__,   \
+      __LINE__)
+
+void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index, const char *file, int line);
+#define cuda_malloc_async(size, stream, gpu_index)                            \
+  cuda_int_malloc_async(size, stream, gpu_index, __FILE__, __LINE__)
+void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index);
 
 bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 
 uint64_t cuda_device_total_memory(uint32_t gpu_index);
@@ -103,18 +119,28 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
 
 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
+
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
     uint32_t gpu_index, bool gpu_memory_allocated);
 
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                   cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
+                                      uint64_t size, cudaStream_t stream,
+                                      uint32_t gpu_index);
 
 void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                             uint32_t gpu_index);
+void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+                                uint32_t gpu_index);
 
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
 
 void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           uint64_t size, cudaStream_t stream,
@@ -123,6 +149,8 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
 
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                        cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
+                           cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_number_of_gpus();
 
@@ -130,13 +158,26 @@ int cuda_get_number_of_sms();
 
 void cuda_synchronize_device(uint32_t gpu_index);
 
-void cuda_drop(void *ptr, uint32_t gpu_index);
+void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line);
+#define cuda_drop(ptr, gpu_index)                                             \
+  cuda_int_drop(ptr, gpu_index, __FILE__, __LINE__)
 
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated);
+void cuda_ext_drop(void *ptr, uint32_t gpu_index);
 
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                            uint32_t gpu_index,
+                                            bool gpu_memory_allocated,
+                                            const char *file, int line);
+
+#define cuda_drop_with_size_tracking_async(ptr, stream, gpu_index,            \
+                                           gpu_memory_allocated)              \
+  cuda_int_drop_with_size_tracking_async(                                     \
+      ptr, stream, gpu_index, gpu_memory_allocated, __FILE__, __LINE__)
+
+void *cuda_int_drop_async_placeholder_removed
+void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                         const char *file, int line);
+#define cuda_drop_async(ptr, stream, gpu_index)                               \
+  cuda_int_drop_async(ptr, stream, gpu_index, __FILE__, __LINE__)
 }
 
 uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index e067ff54f..2433edd73 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -3,18 +3,33 @@
 #include
 #include
 #include
+#include
+#include
 #include
+
 #ifdef USE_NVTOOLS
 #include
 #endif
 
 #ifdef CUDA_STREAM_POOL
 #include
-#include
 #include
 #include
 #endif
 
+#include
+
+#define USE_MEMORY_MANAGER
+// #define DEBUG_MEMORY_MANAGER
+#define MAX_CACHE_SIZE (1 << 30)
+
+#ifdef USE_MEMORY_MANAGER
+#include
+#include
+#include
+#include
+#endif
+
 uint32_t cuda_get_device() {
   int device;
   check_cuda_error(cudaGetDevice(&device));
@@ -99,6 +114,330 @@ void cuda_set_device(uint32_t gpu_index) {
 #endif
 }
 
+#ifdef USE_MEMORY_MANAGER
+
+enum CudaMemBlockUsageType { CUDA_ALLOC = 0, MEMSET, MEMCPY_SRC, MEMCPY_DEST, FREE };
+
+enum CudaAllocType { SYNC = 0, ASYNC };
+
+#ifdef DEBUG_MEMORY_MANAGER
+struct CudaMemBlockUsage {
+  std::string location;
+  uint64_t timestamp;
+  CudaMemBlockUsageType type;
+};
+#endif
+
+struct CudaMemBlock {
+  int8_t *ptr;
+  uint64_t size;
+  cudaStream_t stream;
+  uint32_t gpu_index;
+  size_t thread_id;
+  CudaAllocType alloc_type;
+#ifdef DEBUG_MEMORY_MANAGER
+  std::vector<CudaMemBlockUsage> usages;
+#endif
+};
+
+class CudaMemoryManager {
+  std::list<CudaMemBlock> cuda_allocs; // fresh allocs
+  std::list<CudaMemBlock> cuda_freed;  // freed for good
+
+  std::unordered_map<cudaStream_t, std::unordered_map<uint64_t, std::list<CudaMemBlock>>>
+      cache; // freed and re-used
+  uint64_t cache_size = 0, peak_cache_size = 0;
+
+  std::mutex allocs_mutex;
+
+#ifdef DEBUG_MEMORY_MANAGER
+  std::string make_location(const char *file, int line) {
+    std::stringstream sstr;
+    sstr << file << ":" << line;
+    return sstr.str();
+  }
+  uint64_t make_timestamp() {
+    const std::chrono::time_point<std::chrono::system_clock> now =
+        std::chrono::system_clock::now();
+
+    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
+                  now.time_since_epoch())
+                  .count() %
+              1000000;
+    return us;
+  }
+
+  void check_range_is_valid(CudaMemBlockUsageType usage_type, int8_t *dest,
+                            uint64_t size, cudaStream_t stream,
+                            const char *file, int line) {
+    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
+                               usage_type};
+
+    const char *info = NULL;
+    switch (usage_type) {
+    case MEMSET:
+      info = "memset";
+      break;
+    case MEMCPY_SRC:
+      info = "memcpy source";
+      break;
+    case MEMCPY_DEST:
+      info = "memcpy dest";
+      break;
+    default:
+      info = "unknown";
+    }
+
+    auto device_id = cuda_get_device();
+
+    bool found = false;
+    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
+      if (it->ptr == dest && it->gpu_index == device_id) {
+        printf("%s with size tracking: found ptr %p\n", info, dest);
+        if (size > it->size) {
+          PANIC("%s OF %lu bytes TOO BIG TO %p OF SIZE %ld\n", info, size, dest,
+                it->size);
+        }
+        it->usages.push_back(usage);
+        found = true;
+      } else {
+        if (dest > it->ptr && dest < it->ptr + it->size &&
+            it->gpu_index == device_id) {
+          printf("%s with size tracking: indirect ptr %p in buffer %p\n", info,
+                 dest, it->ptr);
+          if (dest + size > it->ptr + it->size) {
+            auto remain_bytes = it->ptr + it->size - dest;
+            PANIC("%s OF %lu bytes TOO BIG TO %p WHICH HAS ROOM ONLY FOR %d\n",
+                  info, size, dest, remain_bytes);
+          }
+          it->usages.push_back(usage);
+          found = true;
+        }
+      }
+    }
+    if (!found) {
+      PANIC("Cuda %s to %p of size %lu, unknown pointer", info, dest, size);
+    }
+  }
+#endif
+
+public:
+  void alloc(void **ptr, uint64_t size, CudaAllocType alloc_type,
+             uint32_t gpu_index, cudaStream_t stream, const char *file,
+             int line) {
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    auto cache_of_stream = cache.find(stream);
+    if (cache_of_stream != cache.end()) {
+      auto cache_of_size = cache_of_stream->second.find(size);
+      if (cache_of_size != cache_of_stream->second.end() &&
+          !cache_of_size->second.empty()) {
+        auto cached_alloc = cache_of_size->second.front();
+        cache_of_size->second.pop_front();
+
+        // move to active allocs
+        cuda_allocs.push_back(cached_alloc);
+        *ptr = cached_alloc.ptr;
+
+        if (cache_size < size) {
+          PANIC("INVALID CACHE USE!!");
+        }
+
+        cache_size -= size;
+
+#ifdef DEBUG_MEMORY_MANAGER
+        printf("Cuda Allocation serviced from cache: %p of size %lu on gpu %d "
+               "in %s\n",
+               ptr, size, gpu_index, "");
+#endif
+        return;
+      }
+    }
+
+    cuda_set_device(gpu_index);
+    if (alloc_type == SYNC) {
+      check_cuda_error(cudaMalloc(ptr, size));
+    } else if (alloc_type == ASYNC) {
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION >= 11020)
+      int support_async_alloc;
+      check_cuda_error(cudaDeviceGetAttribute(
+          &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+      if (support_async_alloc) {
+        check_cuda_error(cudaMallocAsync(ptr, size, stream));
+      } else {
+        check_cuda_error(cudaMalloc(ptr, size));
+      }
+#else
+      check_cuda_error(cudaMalloc(ptr, size));
+#endif
+    } else {
+      PANIC("Invalid allocation mode");
+    }
+
+    if (*ptr == nullptr) {
+      if (size > 0) {
+        PANIC("Allocation failed for %lu bytes, allocator returned %p", size,
+              ptr);
+      }
+      return;
+    }
+
+    auto thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id());
+    CudaMemBlock block = {(int8_t *)*ptr, size,      stream,
+                          gpu_index,      thread_id, alloc_type};
+#ifdef DEBUG_MEMORY_MANAGER
+    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
+                               CUDA_ALLOC};
+    block.usages.push_back(usage);
+
+    printf("Cuda Allocated %p of size %lu on gpu %d in %s\n", ptr, size,
+           gpu_index, usage.location.c_str());
+#endif
+
+    cuda_allocs.push_back(block);
+  }
+
+  void memset(int8_t *dest, uint64_t size, cudaStream_t stream,
+              const char *file, int line) {
+#ifdef DEBUG_MEMORY_MANAGER
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    check_range_is_valid(MEMSET, dest, size, stream, file, line);
+#endif
+  }
+
+  void memcpy(int8_t *dest, int8_t *src, uint64_t size, cudaStream_t stream,
+              const char *file, int line) {
+#ifdef DEBUG_MEMORY_MANAGER
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    check_range_is_valid(MEMCPY_SRC, src, size, stream, file, line);
+    check_range_is_valid(MEMCPY_DEST, dest, size, stream, file, line);
+#endif
+  }
+
+  void free(void *ptr, CudaAllocType alloc_type, uint32_t gpu_index,
+            cudaStream_t stream, const char *file, int line) {
+    if (ptr == nullptr)
+      return;
+
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    bool found = false;
+    bool must_free = false;
+
+    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
+      if (it->ptr == ptr && it->gpu_index == gpu_index) {
+        found = true;
+
+        if (cache_size + it->size < (MAX_CACHE_SIZE)) {
+          cache[stream][it->size].push_back(*it);
+          cache_size += it->size;
+          if (peak_cache_size < cache_size) {
+            peak_cache_size = cache_size;
+          }
+        } else {
+          cuda_freed.push_back(*it);
+          must_free = true;
+        }
+#ifdef DEBUG_MEMORY_MANAGER
+        printf("cuda dropped buffer %p of size %lu on gpu %d\n", ptr, it->size,
+               gpu_index);
+#endif
+        cuda_allocs.erase(it);
+        break;
+      }
+    }
+
+    if (must_free) {
+      cuda_set_device(gpu_index);
+      if (alloc_type == SYNC) {
+        check_cuda_error(cudaFree(ptr));
+      } else if (alloc_type == ASYNC) {
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION >= 11020)
+        int support_async_alloc;
+        check_cuda_error(cudaDeviceGetAttribute(
+            &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+        if (support_async_alloc) {
+          check_cuda_error(cudaFreeAsync(ptr, stream));
+        } else {
+          check_cuda_error(cudaFree(ptr));
+        }
+#else
+        check_cuda_error(cudaFree(ptr));
+#endif
+      }
+    }
+
+#ifdef DEBUG_MEMORY_MANAGER
+    if (!found) {
+      for (auto it = cuda_freed.begin(); it != cuda_freed.end(); it++) {
+        if (it->ptr == ptr && it->gpu_index == gpu_index) {
+          found = true;
+          printf("Drop in %s: %d\n", file, line);
+          printf("Alloc in %s\n", it->usages[0].location.c_str());
+          PANIC("cuda drop already dropped buffer %p of size %lu on gpu %d\n",
+                ptr, it->size, gpu_index);
+        }
+      }
+    }
+
+    if (!found) {
+      PANIC("cuda drop unknown buffer %p\n", ptr);
+    }
+#endif
+  }
+
+  ~CudaMemoryManager() {
+#ifdef DEBUG_MEMORY_MANAGER
+    printf("%lu ALLOCATIONS AT PROGRAM EXIT\n", cuda_allocs.size());
+
+    for (auto &cuda_alloc : cuda_allocs) {
+      printf("%p of size %lu allocated at %s\n", cuda_alloc.ptr,
+             cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
+    }
+
+    printf("\n\n\n %lu PEAK CACHE SIZE\n", peak_cache_size);
+
+    for (auto &cache_for_stream : cache) {
+      for (auto &cache_for_size : cache_for_stream.second) {
+        for (auto &cuda_alloc : cache_for_size.second) {
+          printf("%p of size %lu cached at %s\n", cuda_alloc.ptr,
+                 cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
+        }
+      }
+    }
+#endif
+  }
+};
+
+class CudaMultiGPUMemoryManager {
+  std::unordered_map<uint32_t, CudaMemoryManager> gMemManagers;
+  std::mutex gMemManagersMutex; // for creation of the mem managers
+  std::atomic<uint32_t> gMemManagerExists = 0;
+
+public:
+  CudaMemoryManager &get(uint32_t gpu_index) {
+    if (gMemManagerExists.load() & (1 << gpu_index)) {
+      return gMemManagers[gpu_index];
+    } else {
+      std::lock_guard<std::mutex> guard(gMemManagersMutex);
+      uint32_t exist_flags = gMemManagerExists.load();
+      if (!(exist_flags & (1 << gpu_index))) {
+        gMemManagers[gpu_index]; // create it
+        gMemManagerExists.store(exist_flags | (1 << gpu_index));
+      }
+      return gMemManagers[gpu_index];
+    }
+  }
+};
+
+CudaMultiGPUMemoryManager gCudaMemoryManager;
+#endif
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
   cuda_set_device(gpu_index);
   cudaEvent_t event;
@@ -221,24 +560,38 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
+  void *ptr = nullptr;
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, SYNC, gpu_index, 0,
+                                          "rust_code", 0);
+#else
   cuda_set_device(gpu_index);
-  void *ptr;
   check_cuda_error(cudaMalloc((void **)&ptr, size));
-
+#endif
   return ptr;
 }
 
+void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index) {
+  return cuda_malloc(size, gpu_index);
+}
+
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory) {
+void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                  cudaStream_t stream,
+                                                  uint32_t gpu_index,
+                                                  uint64_t &size_tracker,
+                                                  bool allocate_gpu_memory,
+                                                  const char *file, int line) {
   size_tracker += size;
   void *ptr = nullptr;
   if (!allocate_gpu_memory)
     return ptr;
 
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, ASYNC, gpu_index, stream,
+                                          file, line);
+#else
   cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
@@ -256,16 +609,23 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
 #else
   check_cuda_error(cudaMalloc((void **)&ptr, size));
 #endif
+#endif
+
   return ptr;
 }
 
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
-                        uint32_t gpu_index) {
+void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index, const char *file, int line) {
   uint64_t size_tracker = 0;
-  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                              size_tracker, true);
+  return cuda_intern_malloc_with_size_tracking_async(
+      size, stream, gpu_index, size_tracker, true, file, line);
+}
+
+void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index) {
+  return cuda_malloc_async(size, stream, gpu_index);
 }
 
 /// Check that allocation is valid
@@ -340,6 +700,11 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                               gpu_index, true);
 }
 
+void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
+}
+
 /// Copy memory within a GPU asynchronously
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
     uint32_t gpu_index, bool gpu_memory_allocated) {
@@ -372,6 +737,12 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                               gpu_index, true);
 }
 
+void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
+                                      uint64_t size, cudaStream_t stream,
+                                      uint32_t gpu_index) {
+  cuda_memcpy_async_gpu_to_gpu(dest, src, size, stream, gpu_index);
+}
+
 /// Copy memory within a GPU
 void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                             uint32_t gpu_index) {
@@ -396,6 +767,11 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
   }
 }
 
+void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+                                uint32_t gpu_index) {
+  cuda_memcpy_gpu_to_gpu(dest, src, size, gpu_index);
+}
+
 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
   cuda_set_device(gpu_index);
@@ -408,6 +784,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           bool gpu_memory_allocated) {
   if (size == 0 || !gpu_memory_allocated)
     return;
+
   cudaPointerAttributes attr;
   check_cuda_error(cudaPointerGetAttributes(&attr, dest));
   if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
@@ -415,6 +792,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
   }
   cuda_set_device(gpu_index);
   check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
+  gCudaMemoryManager.get(gpu_index).memset((int8_t *)dest, size, stream, "", 0);
 }
 
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
@@ -423,6 +801,11 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                                        true);
 }
 
+void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
+                           cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memset_async(dest, val, size, stream, gpu_index);
+}
+
 template <typename Torus>
 __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -474,6 +857,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
 
+void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
+}
+
 /// Return number of GPUs available
 int cuda_get_number_of_gpus() {
   int num_gpus;
@@ -489,19 +877,31 @@ int cuda_get_number_of_sms() {
 }
 
 /// Drop a cuda array
-void cuda_drop(void *ptr, uint32_t gpu_index) {
+void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line) {
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).free(ptr, SYNC, gpu_index, 0, file, line);
+#else
   cuda_set_device(gpu_index);
   check_cuda_error(cudaFree(ptr));
+#endif
 }
 
+void cuda_ext_drop(void *ptr, uint32_t gpu_index) { cuda_drop(ptr, gpu_index); }
+
 /// Drop a cuda array asynchronously, if the data was allocated & it's supported
 /// on the device
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated) {
+void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                            uint32_t gpu_index,
+                                            bool gpu_memory_allocated,
+                                            const char *file, int line) {
   if (!gpu_memory_allocated)
     return;
+
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).free(ptr, ASYNC, gpu_index, stream, file,
+                                         line);
+#else
   cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -518,11 +918,14 @@ void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
 #else
   check_cuda_error(cudaFree(ptr));
 #endif
+#endif
 }
 
 /// Drop a cuda array asynchronously, if supported on the device
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
-  cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, true);
+void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                         const char *file, int line) {
+  cuda_int_drop_with_size_tracking_async(ptr, stream, gpu_index, true, file,
+                                         line);
 }
 
 /// Get the maximum size for the shared memory per streaming multiprocessors
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 72c0434b3..a5531a926 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -11,21 +11,13 @@ extern "C" {
 
     pub fn cuda_is_available() -> u32;
 
-    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_ext_malloc(size: u64, gpu_index: u32) -> *mut c_void;
 
-    pub fn cuda_malloc_with_size_tracking_async(
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        size_tracker: *mut u64,
-        allocate_gpu_memory: bool,
-    ) -> *mut c_void;
-
-    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_ext_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
 
     pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
 
     pub fn cuda_device_total_memory(gpu_index: u32) -> u64;
 
-    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
+    pub fn cuda_ext_memcpy_with_size_tracking_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -34,7 +26,7 @@ extern "C" {
         gpu_memory_allocated: bool,
     );
 
-    pub fn cuda_memcpy_async_to_gpu(
+    pub fn cuda_ext_memcpy_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -42,9 +34,14 @@ extern "C" {
         gpu_index: u32,
     );
 
-    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
+    pub fn cuda_ext_memcpy_gpu_to_gpu(
+        dest: *mut c_void,
+        src: *const c_void,
+        size: u64,
+        gpu_index: u32,
+    );
 
-    pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+    pub fn cuda_ext_memcpy_with_size_tracking_async_gpu_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -53,7 +50,7 @@ extern "C" {
         gpu_memory_allocated: bool,
     );
 
-    pub fn cuda_memcpy_async_gpu_to_gpu(
+    pub fn cuda_ext_memcpy_async_gpu_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -61,7 +58,7 @@ extern "C" {
         gpu_index: u32,
     );
 
-    pub fn cuda_memcpy_async_to_cpu(
+    pub fn cuda_ext_memcpy_async_to_cpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -78,7 +75,7 @@ extern "C" {
         gpu_memory_allocated: bool,
     );
 
-    pub fn cuda_memset_async(
+    pub fn cuda_ext_memset_async(
         dest: *mut c_void,
         val: u64,
         size: u64,
@@ -92,7 +89,7 @@ extern "C" {
 
     pub fn cuda_synchronize_device(gpu_index: u32);
 
-    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
+    pub fn cuda_ext_drop(ptr: *mut c_void, gpu_index: u32);
 
     pub fn cuda_drop_with_size_tracking_async(
         ptr: *mut c_void,
@@ -102,7 +99,7 @@ extern "C" {
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
+    pub fn cuda_ext_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
 
     pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
 
diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs
index 9b666a49d..4969d305a 100644
--- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs
+++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs
@@ -4,7 +4,7 @@ use crate::core_crypto::prelude::{
     CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
     LweDimension, LweSize, UnsignedInteger,
 };
-use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu;
+use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_gpu_to_gpu;
 
 /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU.
 #[derive(Clone, Debug)]
@@ -123,7 +123,7 @@ impl CudaLweCiphertextList {
             * std::mem::size_of::<T>();
         // Concatenate gpu_index memory
         unsafe {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 ptr,
                 first_item.0.d_vec.as_c_ptr(0),
                 size as u64,
@@ -132,7 +132,7 @@ impl CudaLweCiphertextList {
             );
             ptr = ptr.wrapping_byte_add(size);
             for list in cuda_ciphertexts_list_vec {
-                cuda_memcpy_async_gpu_to_gpu(
+                cuda_ext_memcpy_async_gpu_to_gpu(
                     ptr,
                     list.0.d_vec.as_c_ptr(0),
                     size as u64,
diff --git a/tfhe/src/core_crypto/gpu/slice.rs b/tfhe/src/core_crypto/gpu/slice.rs
index db0409c3a..fc8bc97c5 100644
--- a/tfhe/src/core_crypto/gpu/slice.rs
+++ b/tfhe/src/core_crypto/gpu/slice.rs
@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::Numeric;
 use std::ffi::c_void;
 use std::marker::PhantomData;
-use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_async_to_cpu};
+use tfhe_cuda_backend::cuda_bind::{
+    cuda_ext_memcpy_async_gpu_to_gpu, cuda_ext_memcpy_async_to_cpu,
+};
 
 #[derive(Debug, Clone)]
 pub struct CudaSlice<'a, T: Numeric> {
@@ -98,7 +100,7 @@ where
         let size = src.len(index) * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 self.as_mut_c_ptr(index),
                 src.as_c_ptr(index),
                 size as u64,
@@ -123,7 +125,7 @@ where
         let size = self.len(index) * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_to_cpu(
+            cuda_ext_memcpy_async_to_cpu(
                 dest.as_mut_ptr().cast::<c_void>(),
                 self.as_c_ptr(index),
                 size as u64,
diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs
index 661f92c6d..ca90a71a6 100644
--- a/tfhe/src/core_crypto/gpu/vec.rs
+++ b/tfhe/src/core_crypto/gpu/vec.rs
@@ -6,9 +6,9 @@ use std::collections::Bound::{Excluded, Included, Unbounded};
 use std::ffi::c_void;
 use std::marker::PhantomData;
 use tfhe_cuda_backend::cuda_bind::{
-    cuda_drop, cuda_malloc, cuda_malloc_async, cuda_memcpy_async_gpu_to_gpu,
-    cuda_memcpy_async_to_cpu, cuda_memcpy_async_to_gpu, cuda_memcpy_gpu_to_gpu, cuda_memset_async,
-    cuda_synchronize_device,
+    cuda_ext_drop, cuda_ext_malloc, cuda_ext_malloc_async, cuda_ext_memcpy_async_gpu_to_gpu,
+    cuda_ext_memcpy_async_to_cpu, cuda_ext_memcpy_async_to_gpu, cuda_ext_memcpy_gpu_to_gpu,
+    cuda_ext_memset_async, cuda_synchronize_device,
 };
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -74,8 +74,8 @@ impl Clone for CudaVec {
         for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
             unsafe {
                 cuda_synchronize_device(gpu_index.0);
-                let ptr = cuda_malloc(size, gpu_index.0);
-                cuda_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
+                let ptr = cuda_ext_malloc(size, gpu_index.0);
+                cuda_ext_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                 cloned_vec.push(ptr);
             }
         }
@@ -101,12 +101,12 @@ impl CudaVec {
     /// - `streams` __must__ be synchronized to guarantee computation has finished
     pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self {
         let size = len as u64 * std::mem::size_of::<T>() as u64;
-        let ptr = cuda_malloc_async(
+        let ptr = cuda_ext_malloc_async(
             size,
             streams.ptr[stream_index as usize],
             streams.gpu_indexes[stream_index as usize].0,
         );
-        cuda_memset_async(
+        cuda_ext_memset_async(
             ptr,
             0u64,
             size,
@@ -129,9 +129,9 @@ impl CudaVec {
         let mut ptrs = Vec::with_capacity(streams.len());
         for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
             let gpu_index = streams.gpu_indexes[i];
-            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
+            let ptr = unsafe { cuda_ext_malloc_async(size, stream_ptr, gpu_index.0) };
             unsafe {
-                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
+                cuda_ext_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
             }
             streams.synchronize_one(i as u32);
             ptrs.push(ptr);
@@ -179,7 +179,7 @@ impl CudaVec {
         let size = self.len() * std::mem::size_of::<T>();
         // We check that self is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memset_async(
+            cuda_ext_memset_async(
                 self.as_mut_c_ptr(stream_index),
                 value,
                 size as u64,
@@ -209,7 +209,7 @@ impl CudaVec {
         // We have to check that src is not empty, because Rust slice with size 0 results in an
         // invalid pointer being passed to copy_to_gpu_async
         if size > 0 {
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                 self.as_mut_c_ptr(stream_index),
                 src.as_ptr().cast(),
                 size as u64,
@@ -237,7 +237,7 @@ impl CudaVec {
             // invalid pointer being passed to copy_to_gpu_async
             if size > 0 {
                 let gpu_index = streams.gpu_indexes[i];
-                cuda_memcpy_async_to_gpu(
+                cuda_ext_memcpy_async_to_gpu(
                     self.get_mut_c_ptr(i as u32),
                     src.as_ptr().cast(),
                     size as u64,
@@ -266,7 +266,7 @@ impl CudaVec {
         let size = src.len() * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 self.as_mut_c_ptr(stream_index),
                 src.as_c_ptr(stream_index),
                 size as u64,
@@ -305,7 +305,7 @@ impl CudaVec {
                 .as_c_ptr(stream_index)
                 .add(start * std::mem::size_of::<T>());
             let size = (end - start + 1) * std::mem::size_of::<T>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 self.as_mut_c_ptr(stream_index),
                 src_ptr,
                 size as u64,
@@ -342,7 +342,7 @@ impl CudaVec {
                 .as_mut_c_ptr(stream_index)
                 .add(start * std::mem::size_of::<T>());
             let size = (end - start + 1) * std::mem::size_of::<T>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 dest_ptr,
                 src.as_c_ptr(stream_index),
                 size as u64,
@@ -366,7 +366,7 @@ impl CudaVec {
         // We have to check that self is not empty, because Rust slice with size 0 results in an
         // invalid pointer being passed to copy_to_cpu_async
         if size > 0 {
-            cuda_memcpy_async_to_cpu(
+            cuda_ext_memcpy_async_to_cpu(
                 dest.as_mut_ptr().cast(),
                 self.as_c_ptr(stream_index),
                 size as u64,
@@ -484,7 +484,7 @@ impl Drop for CudaVec {
             // Synchronizes the device to be sure no stream is still using this pointer
             let gpu_index = self.gpu_indexes[i];
             synchronize_device(gpu_index.0);
-            unsafe { cuda_drop(ptr, gpu_index.0) };
+            unsafe { cuda_ext_drop(ptr, gpu_index.0) };
         }
     }
 }
diff --git a/tfhe/src/integer/gpu/ciphertext/compact_list.rs b/tfhe/src/integer/gpu/ciphertext/compact_list.rs
index 4b9bbfca7..f94b730c9 100644
--- a/tfhe/src/integer/gpu/ciphertext/compact_list.rs
+++ b/tfhe/src/integer/gpu/ciphertext/compact_list.rs
@@ -20,7 +20,7 @@ use crate::shortint::{AtomicPatternKind, CarryModulus, Ciphertext, MessageModulus};
 use crate::GpuIndex;
 use itertools::Itertools;
 use serde::Deserializer;
-use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_to_gpu;
+use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_to_gpu;
 
 #[derive(Clone)]
 pub struct CudaCompactCiphertextListInfo {
@@ -271,7 +271,7 @@ impl CudaFlattenedVecCompactCiphertextList {
                 let dest_ptr = d_flattened_d_vec
                     .as_mut_c_ptr(0)
                     .add(offset * std::mem::size_of::<u64>());
-                cuda_memcpy_async_to_gpu(
+                cuda_ext_memcpy_async_to_gpu(
                     dest_ptr,
                     container.as_ptr().cast(),
                     (expected_length * std::mem::size_of::<u64>()) as u64,
diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs
index 27eaf100e..41a14fb14 100644
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -15,7 +15,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
 use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
-    compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
+    compress_integer_radix_async, cuda_ext_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
     get_compression_size_on_gpu, get_decompression_size_on_gpu,
 };
 use crate::prelude::CastInto;
@@ -248,7 +248,7 @@ impl CudaCompressionKey {
                     .as_mut_c_ptr(0)
                     .add(offset * std::mem::size_of::<u64>());
                 let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
-                cuda_memcpy_async_gpu_to_gpu(
+                cuda_ext_memcpy_async_gpu_to_gpu(
                     dest_ptr,
                     ciphertext.d_blocks.0.d_vec.as_c_ptr(0),
                     size as u64,
@@ -577,7 +577,7 @@ impl CudaNoiseSquashingCompressionKey {
                     .as_mut_c_ptr(0)
                     .add(offset * std::mem::size_of::<u64>());
                 let size = ciphertext.packed_d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
-                cuda_memcpy_async_gpu_to_gpu(
+                cuda_ext_memcpy_async_gpu_to_gpu(
                     dest_ptr,
                     ciphertext.packed_d_blocks.0.d_vec.as_c_ptr(0),
                     size as u64,