From 60d137de6e07ac7234a6ebaa331abbd5cd86f839 Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Thu, 7 Aug 2025 13:26:03 +0200
Subject: [PATCH] feat(gpu): use mempools to optimize mem reuse

---
 backends/tfhe-cuda-backend/cuda/src/device.cu | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index 1df2a12c0..c6f0997d8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,15 +1,88 @@
 #include "device.h"
 #include
 #include
+#include <mutex>
 
 uint32_t cuda_get_device() {
   int device;
   check_cuda_error(cudaGetDevice(&device));
   return static_cast<uint32_t>(device);
 }
 
+std::mutex pool_mutex;
+bool mem_pools_enabled = false;
+
+// We use memory pools to reduce some of the overhead of memory allocations
+// caused by our scratch/release pattern. This function is the simplest way of
+// using mempools: it modifies the default memory pool to use a threshold of 5%
+// of the total memory:
+// - Enable opportunistic reuse to maximize reuse in malloc/free patterns
+// - Prevent memory from being released back to the OS too soon if it is
+//   within our threshold
+// - Warm up the pool by allocating and freeing a large block of memory
+// This function is called only once, the first time a GPU is set, and it
+// configures all the available GPUs.
+// We have measured an improvement of around 10% in our integer operations,
+// especially the ones involving many allocations.
+// We tested more complex mempool configurations, but they did not yield
+// better results.
+void cuda_setup_mempool(uint32_t caller_gpu_index) {
+  if (!mem_pools_enabled) {
+    std::lock_guard<std::mutex> lock(pool_mutex);
+    if (mem_pools_enabled)
+      return; // Another thread enabled the mem pools while we waited
+
+    // We do it only once for all GPUs
+    mem_pools_enabled = true;
+    uint32_t num_gpus = cuda_get_number_of_gpus();
+    for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
+      cuda_set_device(gpu_index);
+
+      size_t total_mem, free_mem;
+      check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
+
+      // Only set up the mempool if the free memory exceeds the threshold
+      uint64_t mem_pool_threshold = total_mem / 20; // 5% of total memory
+      mem_pool_threshold =
+          mem_pool_threshold - (mem_pool_threshold % 1024); // Align to 1KB
+      if (mem_pool_threshold < free_mem) {
+        // Get the default memory pool
+        cudaMemPool_t default_pool;
+        check_cuda_error(
+            cudaDeviceGetDefaultMemPool(&default_pool, gpu_index));
+
+        // Enable opportunistic reuse
+        int reuse = 1;
+        check_cuda_error(cudaMemPoolSetAttribute(
+            default_pool, cudaMemPoolReuseAllowOpportunistic, &reuse));
+
+        // Prevent memory from being released back to the OS too soon
+        check_cuda_error(cudaMemPoolSetAttribute(
+            default_pool, cudaMemPoolAttrReleaseThreshold,
+            &mem_pool_threshold));
+
+        // Warm up the pool by allocating and freeing a large block
+        cudaStream_t stream = cuda_create_stream(gpu_index);
+        void *warmup_ptr =
+            cuda_malloc_async(mem_pool_threshold, stream, gpu_index);
+        cuda_drop_async(warmup_ptr, stream, gpu_index);
+
+        // Sync to ensure the pool has grown
+        cuda_synchronize_stream(stream, gpu_index);
+
+        // Clean up
+        cuda_destroy_stream(stream, gpu_index);
+      }
+    }
+    // We return to the original gpu_index
+    cuda_set_device(caller_gpu_index);
+  }
+}
+
 void cuda_set_device(uint32_t gpu_index) {
   check_cuda_error(cudaSetDevice(gpu_index));
+  // Mempools are initialized only once, for all the available GPUs
+  cuda_setup_mempool(gpu_index);
 }
 
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
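
Note (not part of the patch): the following is a minimal standalone sketch of
the default-mempool configuration pattern this change relies on, written
directly against the CUDA runtime API instead of the backend helpers
(cuda_create_stream, cuda_malloc_async, cuda_drop_async). Device 0, the CHECK
macro and the fixed 64 MB threshold are placeholders chosen for the example,
not values used by the backend.

    // mempool_sketch.cu, illustration only
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    #define CHECK(call)                                                        \
      do {                                                                     \
        cudaError_t err = (call);                                              \
        if (err != cudaSuccess) {                                              \
          std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));   \
          std::exit(EXIT_FAILURE);                                             \
        }                                                                      \
      } while (0)

    int main() {
      int device = 0;
      CHECK(cudaSetDevice(device));

      // Configure the device's default pool: allow opportunistic reuse and
      // keep up to `threshold` bytes cached instead of releasing them to the
      // OS at synchronization points.
      cudaMemPool_t pool;
      CHECK(cudaDeviceGetDefaultMemPool(&pool, device));

      int reuse = 1;
      CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowOpportunistic,
                                    &reuse));

      uint64_t threshold = 64ull << 20; // 64 MB, example value
      CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold,
                                    &threshold));

      cudaStream_t stream;
      CHECK(cudaStreamCreate(&stream));

      // Warm up: grow the pool once so that later allocations are served
      // from cached memory instead of going through the driver.
      void *warmup = nullptr;
      CHECK(cudaMallocAsync(&warmup, threshold, stream));
      CHECK(cudaFreeAsync(warmup, stream));
      CHECK(cudaStreamSynchronize(stream));

      // Scratch/release pattern: each iteration allocates and frees a
      // temporary buffer; with the threshold set, the freed bytes stay in
      // the pool and the next cudaMallocAsync reuses them.
      for (int i = 0; i < 10; ++i) {
        void *scratch = nullptr;
        CHECK(cudaMallocAsync(&scratch, threshold / 2, stream));
        CHECK(cudaFreeAsync(scratch, stream));
      }
      CHECK(cudaStreamSynchronize(stream));

      CHECK(cudaStreamDestroy(stream));
      return 0;
    }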
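
A second sketch, also illustration only: the effect of the release threshold
can be observed by querying the pool counters after the warm-up block has been
freed. cudaMemPoolAttrReservedMemCurrent and cudaMemPoolAttrUsedMemCurrent are
real CUDA runtime attributes; the helper below is a hypothetical debugging
aid, not something this patch adds to the backend.

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Print how much memory the pool currently reserves (cached + live)
    // and how much of it backs live allocations.
    static void print_pool_stats(cudaMemPool_t pool) {
      uint64_t reserved = 0, used = 0;
      cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent,
                              &reserved);
      cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &used);
      std::printf("pool reserved: %llu bytes, used: %llu bytes\n",
                  (unsigned long long)reserved, (unsigned long long)used);
    }

After the warm-up allocation is freed and the stream is synchronized, the
reserved counter should stay close to the release threshold while the used
counter drops back to zero; that cached memory is what later scratch
allocations reuse without paying the allocation cost again.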