Mirror of https://github.com/zama-ai/tfhe-rs.git — synced 2026-01-08 22:28:01 -05:00
feat(gpu): add memory management
Committed by Andrei Stoian
parent 70fa68bf52 · commit 846eed184e

Diffstat: Makefile — 13 lines changed
@@ -994,6 +994,19 @@ test_high_level_api: install_rs_build_toolchain
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::
 
+test_high_level_api_gpu_one: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
+		--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
+		-- --nocapture high_level_api::array::tests::booleans::test_gpu_only_bitand
+
+#
+test_high_level_api_gpu_mul: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
+		--features=integer,internal-keycache,gpu-debug --profile release \
+		-p tfhe \
+		-- --nocapture integer::gpu::server_key::radix::tests_unsigned::test_mul:: \
+		--test-threads=6
+
 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
@@ -5,6 +5,8 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cuda_runtime.h>
+#include <fstream>
+#include <vector>
 
 #define CUDA_STREAM_POOL
 
@@ -83,13 +85,27 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
 uint32_t cuda_is_available();
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
+void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);
 
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory);
+void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                  cudaStream_t stream,
+                                                  uint32_t gpu_index,
+                                                  uint64_t &size_tracker,
+                                                  bool allocate_gpu_memory,
+                                                  const char *file, int line);
 
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+#define cuda_malloc_with_size_tracking_async(                                 \
+    size, stream, gpu_index, size_tracker, allocate_gpu_memory)               \
+  cuda_intern_malloc_with_size_tracking_async(                                \
+      size, stream, gpu_index, size_tracker, allocate_gpu_memory, __FILE__,   \
+      __LINE__)
+
+void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index, const char *file, int line);
+#define cuda_malloc_async(size, stream, gpu_index)                            \
+  cuda_int_malloc_async(size, stream, gpu_index, __FILE__, __LINE__)
+
+void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index);
 
 bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 uint64_t cuda_device_total_memory(uint32_t gpu_index);
@@ -103,18 +119,28 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
 
+void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
+
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
     uint32_t gpu_index, bool gpu_memory_allocated);
 
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                   cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
+                                      uint64_t size, cudaStream_t stream,
+                                      uint32_t gpu_index);
 
 void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                             uint32_t gpu_index);
+void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+                                uint32_t gpu_index);
 
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
 
 void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           uint64_t size, cudaStream_t stream,
@@ -123,6 +149,8 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
 
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                        cudaStream_t stream, uint32_t gpu_index);
+void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
+                           cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_number_of_gpus();
 
@@ -130,13 +158,26 @@ int cuda_get_number_of_sms();
 
 void cuda_synchronize_device(uint32_t gpu_index);
 
-void cuda_drop(void *ptr, uint32_t gpu_index);
+void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line);
+#define cuda_drop(ptr, gpu_index)                                             \
+  cuda_int_drop(ptr, gpu_index, __FILE__, __LINE__)
+
+void cuda_ext_drop(void *ptr, uint32_t gpu_index);
 
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated);
+void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                            uint32_t gpu_index,
+                                            bool gpu_memory_allocated,
+                                            const char *file, int line);
 
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+#define cuda_drop_with_size_tracking_async(ptr, stream, gpu_index,            \
+                                           gpu_memory_allocated)              \
+  cuda_int_drop_with_size_tracking_async(                                     \
+      ptr, stream, gpu_index, gpu_memory_allocated, __FILE__, __LINE__)
+
+void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                         const char *file, int line);
+#define cuda_drop_async(ptr, stream, gpu_index)                               \
+  cuda_int_drop_async(ptr, stream, gpu_index, __FILE__, __LINE__)
 }
 
 uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
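A note on the header changes above: the existing entry points (cuda_malloc_async, cuda_drop, cuda_drop_with_size_tracking_async, ...) become macros that expand to cuda_int_*/cuda_intern_* functions taking the caller's __FILE__ and __LINE__, so every existing C++ call site keeps compiling while the memory manager learns where each allocation or free was issued; the new cuda_ext_* declarations stay plain functions so they can still be bound from Rust. Below is a minimal, self-contained sketch of the same call-site-capture idea — the names are illustrative only and are not part of the tfhe-rs API.

#include <cstdio>
#include <cstdlib>

// The real implementation receives the caller's file/line for bookkeeping.
void *tracked_alloc_impl(size_t size, const char *file, int line) {
  std::printf("alloc of %zu bytes requested at %s:%d\n", size, file, line);
  return std::malloc(size);
}

// Callers keep writing `tracked_alloc(n)`; the macro injects the call site.
#define tracked_alloc(size) tracked_alloc_impl((size), __FILE__, __LINE__)

int main() {
  void *p = tracked_alloc(128); // reported as coming from this file and line
  std::free(p);
}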
@@ -3,18 +3,33 @@
 #include <atomic>
 #include <cstdint>
 #include <cuda_runtime.h>
+#include <deque>
+#include <unordered_map>
 #include <mutex>
 
 #ifdef USE_NVTOOLS
 #include <cuda_profiler_api.h>
 #endif
 
 #ifdef CUDA_STREAM_POOL
 #include <deque>
-#include <mutex>
 #include <vector>
 #include <unordered_map>
 #endif
 
+#include <bits/this_thread_sleep.h>
+
+#define USE_MEMORY_MANAGER
+// #define DEBUG_MEMORY_MANAGER
+#define MAX_CACHE_SIZE (1 << 30)
+
+#ifdef USE_MEMORY_MANAGER
+#include <list>
+#include <sstream>
+#include <string>
+#include <thread>
+#endif
+
 uint32_t cuda_get_device() {
   int device;
   check_cuda_error(cudaGetDevice(&device));
@@ -99,6 +114,330 @@ void cuda_set_device(uint32_t gpu_index) {
 #endif
 }
 
+#ifdef USE_MEMORY_MANAGER
+
+enum CudaMemBlockUsageType { CUDA_ALLOC = 0, MEMSET, MEMCPY_SRC, MEMCPY_DEST, FREE };
+
+enum CudaAllocType { SYNC = 0, ASYNC };
+
+#ifdef DEBUG_MEMORY_MANAGER
+struct CudaMemBlockUsage {
+  std::string location;
+  uint64_t timestamp;
+  CudaMemBlockUsageType type;
+};
+#endif
+
+struct CudaMemBlock {
+  int8_t *ptr;
+  uint64_t size;
+  cudaStream_t stream;
+  uint32_t gpu_index;
+  size_t thread_id;
+  CudaAllocType alloc_type;
+#ifdef DEBUG_MEMORY_MANAGER
+  std::vector<CudaMemBlockUsage> usages;
+#endif
+};
+
+class CudaMemoryManager {
+  std::list<CudaMemBlock> cuda_allocs; // fresh allocs
+  std::list<CudaMemBlock> cuda_freed;  // freed for good
+
+  std::unordered_map<cudaStream_t,
+                     std::unordered_map<uint64_t, std::deque<CudaMemBlock>>>
+      cache; // freed and re-used
+  uint64_t cache_size = 0, peak_cache_size = 0;
+
+  std::mutex allocs_mutex;
+
+#ifdef DEBUG_MEMORY_MANAGER
+  std::string make_location(const char *file, int line) {
+    std::stringstream sstr;
+    sstr << file << ":" << line;
+    return sstr.str();
+  }
+  uint64_t make_timestamp() {
+    const std::chrono::time_point<std::chrono::system_clock> now =
+        std::chrono::system_clock::now();
+
+    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
+                  now.time_since_epoch())
+                  .count() %
+              1000000;
+    return us;
+  }
+
+  void check_range_is_valid(CudaMemBlockUsageType usage_type, int8_t *dest,
+                            uint64_t size, cudaStream_t stream,
+                            const char *file, int line) {
+    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
+                               usage_type};
+
+    const char *info = NULL;
+    switch (usage_type) {
+    case MEMSET:
+      info = "memset";
+      break;
+    case MEMCPY_SRC:
+      info = "memcpy source";
+      break;
+    case MEMCPY_DEST:
+      info = "memcpy dest";
+      break;
+    default:
+      info = "unknown";
+    }
+
+    auto device_id = cuda_get_device();
+
+    bool found = false;
+    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
+      if (it->ptr == dest && it->gpu_index == device_id) {
+        printf("%s with size tracking: found ptr %p\n", info, dest);
+        if (size > it->size) {
+          PANIC("%s OF %lu bytes TOO BIG TO %p OF SIZE %ld\n", info, size, dest,
+                it->size);
+        }
+        it->usages.push_back(usage);
+        found = true;
+      } else {
+        if (dest > it->ptr && dest < it->ptr + it->size &&
+            it->gpu_index == device_id) {
+          printf("%s with size tracking: indirect ptr %p in buffer %p\n", info,
+                 dest, it->ptr);
+          if (dest + size > it->ptr + it->size) {
+            auto remain_bytes = it->ptr + it->size - dest;
+            PANIC("%s OF %lu bytes TOO BIG TO %p WHICH HAS ROOM ONLY FOR %d\n",
+                  info, size, dest, remain_bytes);
+          }
+          it->usages.push_back(usage);
+          found = true;
+        }
+      }
+    }
+    if (!found) {
+      PANIC("Cuda %s to %p of size %lu, unknown pointer", info, dest, size);
+    }
+  }
+#endif
+
+public:
+  void alloc(void **ptr, uint64_t size, CudaAllocType alloc_type,
+             uint32_t gpu_index, cudaStream_t stream, const char *file,
+             int line) {
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    auto cache_of_stream = cache.find(stream);
+    if (cache_of_stream != cache.end()) {
+      auto cache_of_size = cache_of_stream->second.find(size);
+      if (cache_of_size != cache_of_stream->second.end() &&
+          !cache_of_size->second.empty()) {
+        auto cached_alloc = cache_of_size->second.front();
+        cache_of_size->second.pop_front();
+
+        // move to active allocs
+        cuda_allocs.push_back(cached_alloc);
+        *ptr = cached_alloc.ptr;
+
+        if (cache_size < size) {
+          PANIC("INVALID CACHE USE!!");
+        }
+
+        cache_size -= size;
+
+#ifdef DEBUG_MEMORY_MANAGER
+        printf("Cuda Allocation serviced from cache: %p of size %lu on gpu %d "
+               "in %s\n",
+               ptr, size, gpu_index, "");
+#endif
+        return;
+      }
+    }
+
+    cuda_set_device(gpu_index);
+    if (alloc_type == SYNC) {
+      check_cuda_error(cudaMalloc(ptr, size));
+    } else if (alloc_type == ASYNC) {
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION >= 11020)
+      int support_async_alloc;
+      check_cuda_error(cudaDeviceGetAttribute(
+          &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+      if (support_async_alloc) {
+        check_cuda_error(cudaMallocAsync(ptr, size, stream));
+      } else {
+        check_cuda_error(cudaMalloc(ptr, size));
+      }
+#else
+      check_cuda_error(cudaMalloc((void **)&ptr, size));
+#endif
+    } else {
+      PANIC("Invalid allocation mode");
+    }
+
+    if (*ptr == nullptr) {
+      if (size > 0) {
+        PANIC("Allocation failed for %lu bytes, allocator returned %p", size,
+              ptr);
+      }
+      return;
+    }
+
+    auto thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id());
+    CudaMemBlock block = {(int8_t *)*ptr, size,      stream,
+                          gpu_index,      thread_id, alloc_type};
+#ifdef DEBUG_MEMORY_MANAGER
+    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
+                               CUDA_ALLOC};
+    block.usages.push_back(usage);
+
+    printf("Cuda Allocated %p of size %lu on gpu %d in %s\n", ptr, size,
+           gpu_index, usage.location.c_str());
+#endif
+
+    cuda_allocs.push_back(block);
+  }
+  void memset(int8_t *dest, uint64_t size, cudaStream_t stream,
+              const char *file, int line) {
+#ifdef DEBUG_MEMORY_MANAGER
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    check_range_is_valid(MEMSET, dest, size, stream, file, line);
+#endif
+  }
+
+  void memcpy(int8_t *dest, int8_t *src, uint64_t size, cudaStream_t stream,
+              const char *file, int line) {
+#ifdef DEBUG_MEMORY_MANAGER
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    check_range_is_valid(MEMCPY_SRC, src, size, stream, file, line);
+    check_range_is_valid(MEMCPY_DEST, src, size, stream, file, line);
+#endif
+  }
+
+  void free(void *ptr, CudaAllocType alloc_type, uint32_t gpu_index,
+            cudaStream_t stream, const char *file, int line) {
+    if (ptr == nullptr)
+      return;
+
+    std::lock_guard<std::mutex> guard(allocs_mutex);
+
+    bool found = false;
+    bool must_free = false;
+
+    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
+      if (it->ptr == ptr && it->gpu_index == gpu_index) {
+        found = true;
+
+        if (cache_size + it->size < (MAX_CACHE_SIZE)) {
+          cache[stream][it->size].push_back(*it);
+          cache_size += it->size;
+          if (peak_cache_size < cache_size) {
+            peak_cache_size = cache_size;
+          }
+        } else {
+          cuda_freed.push_back(*it);
+          must_free = true;
+        }
+#ifdef DEBUG_MEMORY_MANAGER
+        printf("cuda dropped buffer %p of size %lu on gpu %d\n", ptr, it->size,
+               gpu_index);
+#endif
+        cuda_allocs.erase(it++);
+      }
+    }
+
+    if (must_free) {
+      cuda_set_device(gpu_index);
+      if (alloc_type == SYNC) {
+        check_cuda_error(cudaFree(ptr));
+      } else if (alloc_type == ASYNC) {
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION >= 11020)
+        int support_async_alloc;
+        check_cuda_error(cudaDeviceGetAttribute(
+            &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+        if (support_async_alloc) {
+          check_cuda_error(cudaFreeAsync(ptr, stream));
+        } else {
+          check_cuda_error(cudaFree(ptr));
+        }
+#else
+        check_cuda_error(cudaFree(ptr));
+#endif
+      }
+    }
+
+#ifdef DEBUG_MEMORY_MANAGER
+    if (!found) {
+      for (auto it = cuda_freed.begin(); it != cuda_freed.end(); it++) {
+        if (it->ptr == ptr && it->gpu_index == gpu_index) {
+          found = true;
+          printf("Drop in %s: %d\n", file, line);
+          printf("Alloc in %s\n", it->usages[0].location.c_str());
+          PANIC("cuda drop already dropped buffer %p of size %lu on gpu %d\n",
+                ptr, it->size, gpu_index);
+        }
+      }
+    }
+
+    if (!found) {
+      PANIC("cuda drop unknown buffer %p\n", ptr);
+    }
+#endif
+  }
+
+  ~CudaMemoryManager() {
+#ifdef DEBUG_MEMORY_MANAGER
+    printf("%lu ALLOCATIONS AT PROGRAM EXIT\n", cuda_allocs.size());
+
+    for (auto &cuda_alloc : cuda_allocs) {
+      printf("%p of size %lu allocated at %s\n", cuda_alloc.ptr,
+             cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
+    }
+
+    printf("\n\n\n %llu PEAK CACHE SIZE\n", peak_cache_size);
+
+    for (auto &cache_for_size : cache) {
+      for (auto &cuda_alloc : cache_for_size.second) {
+        printf("%p of size %lu cached at %s\n", cuda_alloc.ptr, cuda_alloc.size,
+               cuda_alloc.usages[0].location.c_str());
+      }
+    }
+#endif
+  }
+};
+
+class CudaMultiGPUMemoryManager {
+  std::unordered_map<uint32_t, CudaMemoryManager> gMemManagers;
+  std::mutex gMemManagersMutex; // for creation of the mem managers
+  std::atomic<uint32_t> gMemManagerExists = 0;
+
+public:
+  CudaMemoryManager &get(uint32_t gpu_index) {
+    if (gMemManagerExists.load() & (1 << gpu_index)) {
+      return gMemManagers[gpu_index];
+    } else {
+      std::lock_guard<std::mutex> guard(gMemManagersMutex);
+      uint32_t exist_flags = gMemManagerExists.load();
+      if (!(exist_flags & (1 << gpu_index))) {
+        gMemManagers[gpu_index]; // create it
+        gMemManagerExists.store(exist_flags | (1 << gpu_index));
+      }
+      return gMemManagers[gpu_index];
+    }
+  }
+};
+
+CudaMultiGPUMemoryManager gCudaMemoryManager;
+#endif
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
   cuda_set_device(gpu_index);
   cudaEvent_t event;
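The CudaMemoryManager above keeps freed blocks in a cache keyed first by stream and then by exact allocation size, bounded by MAX_CACHE_SIZE (1 GiB); an allocation request is first served from that cache and only falls back to the CUDA allocator on a miss. A stripped-down, CUDA-free sketch of that lookup/recycle structure follows (the BlockCache type is hypothetical and purely illustrative):

#include <cstdint>
#include <deque>
#include <unordered_map>

// Minimal sketch of the (stream, size) -> free-block cache used above.
// `Stream` stands in for cudaStream_t; blocks are plain pointers here.
using Stream = void *;

struct BlockCache {
  std::unordered_map<Stream, std::unordered_map<uint64_t, std::deque<void *>>>
      cache;
  uint64_t cache_size = 0;
  static constexpr uint64_t kMaxCacheSize = 1ull << 30; // mirrors MAX_CACHE_SIZE

  // Returns a recycled block of exactly `size` bytes for `stream`, or nullptr.
  void *take(Stream stream, uint64_t size) {
    auto s = cache.find(stream);
    if (s == cache.end())
      return nullptr;
    auto b = s->second.find(size);
    if (b == s->second.end() || b->second.empty())
      return nullptr;
    void *ptr = b->second.front();
    b->second.pop_front();
    cache_size -= size;
    return ptr;
  }

  // Parks a freed block for reuse; returns false when the cache is full and
  // the caller should really release the memory.
  bool put(Stream stream, uint64_t size, void *ptr) {
    if (cache_size + size >= kMaxCacheSize)
      return false;
    cache[stream][size].push_back(ptr);
    cache_size += size;
    return true;
  }
};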
@@ -221,24 +560,38 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
+  void *ptr = nullptr;
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, SYNC, gpu_index, 0,
+                                          "rust_code", 0);
+#else
   cuda_set_device(gpu_index);
-  void *ptr;
   check_cuda_error(cudaMalloc((void **)&ptr, size));
+#endif
   return ptr;
 }
 
+void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index) {
+  return cuda_malloc(size, gpu_index);
+}
+
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory) {
+void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                  cudaStream_t stream,
+                                                  uint32_t gpu_index,
+                                                  uint64_t &size_tracker,
+                                                  bool allocate_gpu_memory,
+                                                  const char *file, int line) {
   size_tracker += size;
   void *ptr = nullptr;
   if (!allocate_gpu_memory)
     return ptr;
+
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, ASYNC, gpu_index, stream,
+                                          file, line);
+#else
   cuda_set_device(gpu_index);
 
 #ifndef CUDART_VERSION
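On the ASYNC path, both the manager and the pre-existing fallback kept under #else probe cudaDevAttrMemoryPoolsSupported and use cudaMallocAsync on the caller's stream when the toolkit is at least 11.2, otherwise they fall back to a synchronous cudaMalloc. A standalone sketch of that probe-and-fallback, with simplified error handling and an illustrative function name, is shown here:

#include <cuda_runtime.h>

// Sketch of the pool-support probe used by the ASYNC allocation path.
// Returns nullptr on failure instead of panicking, to keep the example small.
void *alloc_async_or_fallback(size_t size, cudaStream_t stream, int gpu_index) {
  void *ptr = nullptr;
#if CUDART_VERSION >= 11020
  int pools_supported = 0;
  cudaDeviceGetAttribute(&pools_supported, cudaDevAttrMemoryPoolsSupported,
                         gpu_index);
  if (pools_supported) {
    if (cudaMallocAsync(&ptr, size, stream) != cudaSuccess)
      return nullptr;
    return ptr;
  }
#endif
  // Synchronous fallback when stream-ordered allocation is unavailable.
  if (cudaMalloc(&ptr, size) != cudaSuccess)
    return nullptr;
  return ptr;
}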
@@ -256,16 +609,23 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
 #else
   check_cuda_error(cudaMalloc((void **)&ptr, size));
 #endif
+#endif
+
   return ptr;
 }
 
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
-                        uint32_t gpu_index) {
+void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index, const char *file, int line) {
   uint64_t size_tracker = 0;
-  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                              size_tracker, true);
+  return cuda_intern_malloc_with_size_tracking_async(
+      size, stream, gpu_index, size_tracker, true, file, line);
+}
+
+void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
+                            uint32_t gpu_index) {
+  return cuda_malloc_async(size, stream, gpu_index);
 }
 
 /// Check that allocation is valid
@@ -340,6 +700,11 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                              gpu_index, true);
 }
 
+void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
+}
+
 /// Copy memory within a GPU asynchronously
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
@@ -372,6 +737,12 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                                gpu_index, true);
 }
 
+void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
+                                      uint64_t size, cudaStream_t stream,
+                                      uint32_t gpu_index) {
+  cuda_memcpy_async_gpu_to_gpu(dest, src, size, stream, gpu_index);
+}
+
 /// Copy memory within a GPU
 void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                             uint32_t gpu_index) {
@@ -396,6 +767,11 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
   }
 }
 
+void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+                                uint32_t gpu_index) {
+  cuda_memcpy_gpu_to_gpu(dest, src, size, gpu_index);
+}
+
 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
   cuda_set_device(gpu_index);
@@ -408,6 +784,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           bool gpu_memory_allocated) {
   if (size == 0 || !gpu_memory_allocated)
     return;
+
   cudaPointerAttributes attr;
   check_cuda_error(cudaPointerGetAttributes(&attr, dest));
   if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
@@ -415,6 +792,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
   }
   cuda_set_device(gpu_index);
   check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
+  gCudaMemoryManager.get(gpu_index).memset((int8_t *)dest, size, stream, "", 0);
 }
 
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
@@ -423,6 +801,11 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                                         true);
 }
 
+void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
+                           cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memset_async(dest, val, size, stream, gpu_index);
+}
+
 template <typename Torus>
 __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -474,6 +857,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
 
+void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
+}
+
 /// Return number of GPUs available
 int cuda_get_number_of_gpus() {
   int num_gpus;
@@ -489,19 +877,31 @@ int cuda_get_number_of_sms() {
 }
 
 /// Drop a cuda array
-void cuda_drop(void *ptr, uint32_t gpu_index) {
+void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line) {
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).free(ptr, SYNC, gpu_index, 0, file, line);
+#else
   cuda_set_device(gpu_index);
   check_cuda_error(cudaFree(ptr));
+#endif
 }
 
+void cuda_ext_drop(void *ptr, uint32_t gpu_index) { cuda_drop(ptr, gpu_index); }
+
 /// Drop a cuda array asynchronously, if the data was allocated & it's supported
 /// on the device
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated) {
+void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                            uint32_t gpu_index,
+                                            bool gpu_memory_allocated,
+                                            const char *file, int line) {
+
   if (!gpu_memory_allocated)
     return;
+
+#ifdef USE_MEMORY_MANAGER
+  gCudaMemoryManager.get(gpu_index).free(ptr, ASYNC, gpu_index, stream, file,
+                                         line);
+#else
   cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -518,11 +918,14 @@ void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
 #else
   check_cuda_error(cudaFree(ptr));
 #endif
+#endif
 }
 
 /// Drop a cuda array asynchronously, if supported on the device
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
-  cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, true);
+void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                         const char *file, int line) {
+  cuda_int_drop_with_size_tracking_async(ptr, stream, gpu_index, true, file,
+                                         line);
 }
 
 /// Get the maximum size for the shared memory per streaming multiprocessors
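Per-GPU managers are created lazily: CudaMultiGPUMemoryManager::get checks an atomic bitmask (one bit per GPU index) and only takes the creation mutex when the bit is not yet set. A condensed sketch of that double-checked initialization follows; the PerGpuRegistry type is illustrative only, and, as in the diff, the lock-free fast path assumes the map is no longer mutated once a manager's bit has been published.

#include <atomic>
#include <cstdint>
#include <mutex>
#include <unordered_map>

struct Manager { /* per-GPU state */ };

class PerGpuRegistry {
  std::unordered_map<uint32_t, Manager> managers;
  std::mutex creation_mutex;
  std::atomic<uint32_t> exists_bits{0};

public:
  Manager &get(uint32_t gpu_index) {
    if (exists_bits.load() & (1u << gpu_index))
      return managers[gpu_index]; // fast path: manager already created
    std::lock_guard<std::mutex> guard(creation_mutex);
    uint32_t flags = exists_bits.load();
    if (!(flags & (1u << gpu_index))) {
      managers[gpu_index]; // default-construct the manager for this GPU
      exists_bits.store(flags | (1u << gpu_index));
    }
    return managers[gpu_index];
  }
};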
@@ -11,21 +11,13 @@ extern "C" {
 
     pub fn cuda_is_available() -> u32;
 
-    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_ext_malloc(size: u64, gpu_index: u32) -> *mut c_void;
 
-    pub fn cuda_malloc_with_size_tracking_async(
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        size_tracker: *mut u64,
-        allocate_gpu_memory: bool,
-    ) -> *mut c_void;
-
-    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_ext_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
 
     pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
     pub fn cuda_device_total_memory(gpu_index: u32) -> u64;
 
-    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
+    pub fn cuda_ext_memcpy_with_size_tracking_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -34,7 +26,7 @@ extern "C" {
         gpu_memory_allocated: bool,
     );
 
-    pub fn cuda_memcpy_async_to_gpu(
+    pub fn cuda_ext_memcpy_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -42,9 +34,14 @@ extern "C" {
         gpu_index: u32,
     );
 
-    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
+    pub fn cuda_ext_memcpy_gpu_to_gpu(
+        dest: *mut c_void,
+        src: *const c_void,
+        size: u64,
+        gpu_index: u32,
+    );
 
-    pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+    pub fn cuda_ext_memcpy_with_size_tracking_async_gpu_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -53,7 +50,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );
 
-    pub fn cuda_memcpy_async_gpu_to_gpu(
+    pub fn cuda_ext_memcpy_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
@@ -61,7 +58,7 @@ extern "C" {
        gpu_index: u32,
    );
 
-    pub fn cuda_memcpy_async_to_cpu(
+    pub fn cuda_ext_memcpy_async_to_cpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
@@ -78,7 +75,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );
 
-    pub fn cuda_memset_async(
+    pub fn cuda_ext_memset_async(
        dest: *mut c_void,
        val: u64,
        size: u64,
@@ -92,7 +89,7 @@ extern "C" {
 
     pub fn cuda_synchronize_device(gpu_index: u32);
 
-    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
+    pub fn cuda_ext_drop(ptr: *mut c_void, gpu_index: u32);
 
     pub fn cuda_drop_with_size_tracking_async(
        ptr: *mut c_void,
@@ -102,7 +99,7 @@ extern "C" {
        allocate_gpu_memory: bool,
    );
 
-    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
+    pub fn cuda_ext_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
 
     pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
 
@@ -4,7 +4,7 @@ use crate::core_crypto::prelude::{
     CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
     LweDimension, LweSize, UnsignedInteger,
 };
-use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu;
+use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_gpu_to_gpu;
 
 /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU.
 #[derive(Clone, Debug)]
@@ -123,7 +123,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
             * std::mem::size_of::<T>();
         // Concatenate gpu_index memory
         unsafe {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 ptr,
                 first_item.0.d_vec.as_c_ptr(0),
                 size as u64,
@@ -132,7 +132,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
             );
             ptr = ptr.wrapping_byte_add(size);
             for list in cuda_ciphertexts_list_vec {
-                cuda_memcpy_async_gpu_to_gpu(
+                cuda_ext_memcpy_async_gpu_to_gpu(
                     ptr,
                     list.0.d_vec.as_c_ptr(0),
                     size as u64,
@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::Numeric;
 use std::ffi::c_void;
 use std::marker::PhantomData;
-use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_async_to_cpu};
+use tfhe_cuda_backend::cuda_bind::{
+    cuda_ext_memcpy_async_gpu_to_gpu, cuda_ext_memcpy_async_to_cpu,
+};
 
 #[derive(Debug, Clone)]
 pub struct CudaSlice<'a, T: Numeric> {
@@ -98,7 +100,7 @@ where
         let size = src.len(index) * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                 self.as_mut_c_ptr(index),
                 src.as_c_ptr(index),
                 size as u64,
@@ -123,7 +125,7 @@ where
         let size = self.len(index) * std::mem::size_of::<T>();
         // We check that src is not empty to avoid invalid pointers
         if size > 0 {
-            cuda_memcpy_async_to_cpu(
+            cuda_ext_memcpy_async_to_cpu(
                 dest.as_mut_ptr().cast::<c_void>(),
                 self.as_c_ptr(index),
                 size as u64,
@@ -6,9 +6,9 @@ use std::collections::Bound::{Excluded, Included, Unbounded};
 use std::ffi::c_void;
 use std::marker::PhantomData;
 use tfhe_cuda_backend::cuda_bind::{
-    cuda_drop, cuda_malloc, cuda_malloc_async, cuda_memcpy_async_gpu_to_gpu,
-    cuda_memcpy_async_to_cpu, cuda_memcpy_async_to_gpu, cuda_memcpy_gpu_to_gpu, cuda_memset_async,
-    cuda_synchronize_device,
+    cuda_ext_drop, cuda_ext_malloc, cuda_ext_malloc_async, cuda_ext_memcpy_async_gpu_to_gpu,
+    cuda_ext_memcpy_async_to_cpu, cuda_ext_memcpy_async_to_gpu, cuda_ext_memcpy_gpu_to_gpu,
+    cuda_ext_memset_async, cuda_synchronize_device,
 };
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -74,8 +74,8 @@ impl<T: Numeric> Clone for CudaVec<T> {
         for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
             unsafe {
                 cuda_synchronize_device(gpu_index.0);
-                let ptr = cuda_malloc(size, gpu_index.0);
-                cuda_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
+                let ptr = cuda_ext_malloc(size, gpu_index.0);
+                cuda_ext_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                 cloned_vec.push(ptr);
             }
         }
@@ -101,12 +101,12 @@ impl<T: Numeric> CudaVec<T> {
    /// - `streams` __must__ be synchronized to guarantee computation has finished
    pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self {
        let size = len as u64 * std::mem::size_of::<T>() as u64;
-        let ptr = cuda_malloc_async(
+        let ptr = cuda_ext_malloc_async(
            size,
            streams.ptr[stream_index as usize],
            streams.gpu_indexes[stream_index as usize].0,
        );
-        cuda_memset_async(
+        cuda_ext_memset_async(
            ptr,
            0u64,
            size,
@@ -129,9 +129,9 @@ impl<T: Numeric> CudaVec<T> {
        let mut ptrs = Vec::with_capacity(streams.len());
        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
            let gpu_index = streams.gpu_indexes[i];
-            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
+            let ptr = unsafe { cuda_ext_malloc_async(size, stream_ptr, gpu_index.0) };
            unsafe {
-                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
+                cuda_ext_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
            }
            streams.synchronize_one(i as u32);
            ptrs.push(ptr);
@@ -179,7 +179,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = self.len() * std::mem::size_of::<T>();
        // We check that self is not empty to avoid invalid pointers
        if size > 0 {
-            cuda_memset_async(
+            cuda_ext_memset_async(
                self.as_mut_c_ptr(stream_index),
                value,
                size as u64,
@@ -209,7 +209,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that src is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_ptr().cast(),
                size as u64,
@@ -237,7 +237,7 @@ impl<T: Numeric> CudaVec<T> {
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
            let gpu_index = streams.gpu_indexes[i];
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                self.get_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                size as u64,
@@ -266,7 +266,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = src.len() * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_c_ptr(stream_index),
                size as u64,
@@ -305,7 +305,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src_ptr,
                size as u64,
@@ -342,7 +342,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_mut_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                src.as_c_ptr(stream_index),
                size as u64,
@@ -366,7 +366,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that self is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_cpu_async
        if size > 0 {
-            cuda_memcpy_async_to_cpu(
+            cuda_ext_memcpy_async_to_cpu(
                dest.as_mut_ptr().cast(),
                self.as_c_ptr(stream_index),
                size as u64,
@@ -484,7 +484,7 @@ impl<T: Numeric> Drop for CudaVec<T> {
            // Synchronizes the device to be sure no stream is still using this pointer
            let gpu_index = self.gpu_indexes[i];
            synchronize_device(gpu_index.0);
-            unsafe { cuda_drop(ptr, gpu_index.0) };
+            unsafe { cuda_ext_drop(ptr, gpu_index.0) };
        }
    }
 }
@@ -20,7 +20,7 @@ use crate::shortint::{AtomicPatternKind, CarryModulus, Ciphertext, MessageModulu
 use crate::GpuIndex;
 use itertools::Itertools;
 use serde::Deserializer;
-use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_to_gpu;
+use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_to_gpu;
 
 #[derive(Clone)]
 pub struct CudaCompactCiphertextListInfo {
@@ -271,7 +271,7 @@ impl CudaFlattenedVecCompactCiphertextList {
            let dest_ptr = d_flattened_d_vec
                .as_mut_c_ptr(0)
                .add(offset * std::mem::size_of::<u64>());
-            cuda_memcpy_async_to_gpu(
+            cuda_ext_memcpy_async_to_gpu(
                dest_ptr,
                container.as_ptr().cast(),
                (expected_length * std::mem::size_of::<u64>()) as u64,
@@ -15,7 +15,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
 use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
-    compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
+    compress_integer_radix_async, cuda_ext_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
     get_compression_size_on_gpu, get_decompression_size_on_gpu,
 };
 use crate::prelude::CastInto;
@@ -248,7 +248,7 @@ impl CudaCompressionKey {
                .as_mut_c_ptr(0)
                .add(offset * std::mem::size_of::<u64>());
            let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                ciphertext.d_blocks.0.d_vec.as_c_ptr(0),
                size as u64,
@@ -577,7 +577,7 @@ impl CudaNoiseSquashingCompressionKey {
                .as_mut_c_ptr(0)
                .add(offset * std::mem::size_of::<u128>());
            let size = ciphertext.packed_d_blocks.0.d_vec.len * std::mem::size_of::<u128>();
-            cuda_memcpy_async_gpu_to_gpu(
+            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                ciphertext.packed_d_blocks.0.d_vec.as_c_ptr(0),
                size as u64,