mirror of https://github.com/zama-ai/tfhe-rs.git

feat(gpu): add memory management

committed by Andrei Stoian
parent 70fa68bf52
commit 846eed184e

Makefile (13)
@@ -994,6 +994,19 @@ test_high_level_api: install_rs_build_toolchain
		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
		-- high_level_api::

test_high_level_api_gpu_one: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
		--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
		-- --nocapture high_level_api::array::tests::booleans::test_gpu_only_bitand

#
test_high_level_api_gpu_mul: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
		--features=integer,internal-keycache,gpu-debug --profile release \
		-p tfhe \
		-- --nocapture integer::gpu::server_key::radix::tests_unsigned::test_mul:: \
		--test-threads=6

test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
@@ -5,6 +5,8 @@
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#include <vector>

#define CUDA_STREAM_POOL

@@ -83,13 +85,27 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
uint32_t cuda_is_available();

void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);

void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory);
void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
                                                  cudaStream_t stream,
                                                  uint32_t gpu_index,
                                                  uint64_t &size_tracker,
                                                  bool allocate_gpu_memory,
                                                  const char *file, int line);

void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
#define cuda_malloc_with_size_tracking_async(                                 \
    size, stream, gpu_index, size_tracker, allocate_gpu_memory)               \
  cuda_intern_malloc_with_size_tracking_async(                                \
      size, stream, gpu_index, size_tracker, allocate_gpu_memory, __FILE__,   \
      __LINE__)

void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index, const char *file, int line);
#define cuda_malloc_async(size, stream, gpu_index)                            \
  cuda_int_malloc_async(size, stream, gpu_index, __FILE__, __LINE__)
void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index);

bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
uint64_t cuda_device_total_memory(uint32_t gpu_index);
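The macros above shadow the old function names so that every existing call site transparently forwards its own __FILE__ and __LINE__ to the new *_int_* / *_intern_* variants. A minimal, self-contained sketch of that call-site-tracking pattern, with illustrative names that are not part of the diff:

#include <cstddef>
#include <cstdio>
#include <new>

// Real implementation: receives the caller's location as extra arguments.
void *tracked_alloc_impl(std::size_t size, const char *file, int line) {
  std::printf("alloc of %zu bytes requested at %s:%d\n", size, file, line);
  return ::operator new(size);
}

// The macro keeps the original call syntax but injects the call site, which is
// the same trick the cuda_malloc_async / cuda_drop macros in this header use.
#define tracked_alloc(size) tracked_alloc_impl((size), __FILE__, __LINE__)

int main() {
  void *p = tracked_alloc(128); // logged with this file and this line number
  ::operator delete(p);
  return 0;
}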
@@ -103,18 +119,28 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    void *dest, void const *src, uint64_t size, cudaStream_t stream,
    uint32_t gpu_index, bool gpu_memory_allocated);

void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
                                      uint64_t size, cudaStream_t stream,
                                      uint32_t gpu_index);

void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index);
void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                uint32_t gpu_index);

void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          uint64_t size, cudaStream_t stream,

@@ -123,6 +149,8 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
                           cudaStream_t stream, uint32_t gpu_index);

int cuda_get_number_of_gpus();

@@ -130,13 +158,26 @@ int cuda_get_number_of_sms();

void cuda_synchronize_device(uint32_t gpu_index);

void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line);
#define cuda_drop(ptr, gpu_index)                                             \
  cuda_int_drop(ptr, gpu_index, __FILE__, __LINE__)

void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                        uint32_t gpu_index,
                                        bool gpu_memory_allocated);
void cuda_ext_drop(void *ptr, uint32_t gpu_index);

void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                            uint32_t gpu_index,
                                            bool gpu_memory_allocated,
                                            const char *file, int line);

#define cuda_drop_with_size_tracking_async(ptr, stream, gpu_index,            \
                                           gpu_memory_allocated)              \
  cuda_int_drop_with_size_tracking_async(                                     \
      ptr, stream, gpu_index, gpu_memory_allocated, __FILE__, __LINE__)

void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
                         const char *file, int line);
#define cuda_drop_async(ptr, stream, gpu_index)                               \
  cuda_int_drop_async(ptr, stream, gpu_index, __FILE__, __LINE__)
}

uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
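Alongside each macro the header also declares a plain cuda_ext_* function. A likely reason, consistent with the Rust binding changes later in this diff, is that a preprocessor macro has no linkable symbol, so foreign callers need an ordinary exported function. A self-contained sketch of that arrangement, with illustrative names rather than the diff's own:

#include <cstdio>
#include <new>

// Implementation that records the caller's location.
extern "C" void tracked_free_impl(void *p, const char *file, int line) {
  std::printf("free(%p) requested at %s:%d\n", p, file, line);
  ::operator delete(p);
}

// Macro form used by call sites inside the C++ code base.
#define tracked_free(p) tracked_free_impl((p), __FILE__, __LINE__)

// A macro cannot be bound over FFI, so a plain exported wrapper gives foreign
// callers (e.g. Rust through `extern "C"`) a real symbol; their calls are all
// attributed to this wrapper's file and line, which is the accepted trade-off.
extern "C" void ext_tracked_free(void *p) { tracked_free(p); }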
@@ -3,18 +3,33 @@
#include <atomic>
#include <cstdint>
#include <cuda_runtime.h>
#include <deque>
#include <unordered_map>
#include <mutex>

#ifdef USE_NVTOOLS
#include <cuda_profiler_api.h>
#endif

#ifdef CUDA_STREAM_POOL
#include <deque>
#include <mutex>
#include <vector>
#include <unordered_map>
#endif

#include <bits/this_thread_sleep.h>

#define USE_MEMORY_MANAGER
// #define DEBUG_MEMORY_MANAGER
#define MAX_CACHE_SIZE (1 << 30)

#ifdef USE_MEMORY_MANAGER
#include <list>
#include <sstream>
#include <string>
#include <thread>
#endif

uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
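For reference, the cache cap defined above works out to one gibibyte of freed blocks retained per memory manager; a trivial compile-time check of that arithmetic (plain C++, nothing assumed beyond the macro's value):

// 1 << 30 bytes is 1024 * 1024 * 1024 bytes, i.e. 1 GiB of freed device memory
// kept around for reuse before blocks start being returned to the driver.
static_assert((1 << 30) == 1024 * 1024 * 1024, "MAX_CACHE_SIZE is 1 GiB");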
@@ -99,6 +114,330 @@ void cuda_set_device(uint32_t gpu_index) {
#endif
}

#ifdef USE_MEMORY_MANAGER

enum CudaMemBlockUsageType { CUDA_ALLOC = 0, MEMSET, MEMCPY_SRC, MEMCPY_DEST, FREE };

enum CudaAllocType { SYNC = 0, ASYNC };

#ifdef DEBUG_MEMORY_MANAGER
struct CudaMemBlockUsage {
  std::string location;
  uint64_t timestamp;
  CudaMemBlockUsageType type;
};
#endif

struct CudaMemBlock {
  int8_t *ptr;
  uint64_t size;
  cudaStream_t stream;
  uint32_t gpu_index;
  size_t thread_id;
  CudaAllocType alloc_type;
#ifdef DEBUG_MEMORY_MANAGER
  std::vector<CudaMemBlockUsage> usages;
#endif
};
class CudaMemoryManager {
  std::list<CudaMemBlock> cuda_allocs; // fresh allocs
  std::list<CudaMemBlock> cuda_freed;  // freed for good

  std::unordered_map<cudaStream_t,
                     std::unordered_map<uint64_t, std::deque<CudaMemBlock>>>
      cache; // freed and re-used
  uint64_t cache_size = 0, peak_cache_size = 0;

  std::mutex allocs_mutex;

#ifdef DEBUG_MEMORY_MANAGER
  std::string make_location(const char *file, int line) {
    std::stringstream sstr;
    sstr << file << ":" << line;
    return sstr.str();
  }
  uint64_t make_timestamp() {
    const std::chrono::time_point<std::chrono::system_clock> now =
        std::chrono::system_clock::now();

    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  now.time_since_epoch())
                  .count() %
              1000000;
    return us;
  }

  void check_range_is_valid(CudaMemBlockUsageType usage_type, int8_t *dest,
                            uint64_t size, cudaStream_t stream,
                            const char *file, int line) {
    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
                               usage_type};

    const char *info = NULL;
    switch (usage_type) {
    case MEMSET:
      info = "memset";
      break;
    case MEMCPY_SRC:
      info = "memcpy source";
      break;
    case MEMCPY_DEST:
      info = "memcpy dest";
      break;
    default:
      info = "unknown";
    }

    auto device_id = cuda_get_device();

    bool found = false;
    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
      if (it->ptr == dest && it->gpu_index == device_id) {
        printf("%s with size tracking: found ptr %p\n", info, dest);
        if (size > it->size) {
          PANIC("%s OF %lu bytes TOO BIG TO %p OF SIZE %ld\n", info, size, dest,
                it->size);
        }
        it->usages.push_back(usage);
        found = true;
      } else {
        if (dest > it->ptr && dest < it->ptr + it->size &&
            it->gpu_index == device_id) {
          printf("%s with size tracking: indirect ptr %p in buffer %p\n", info,
                 dest, it->ptr);
          if (dest + size > it->ptr + it->size) {
            auto remain_bytes = it->ptr + it->size - dest;
            PANIC("%s OF %lu bytes TOO BIG TO %p WHICH HAS ROOM ONLY FOR %d\n",
                  info, size, dest, remain_bytes);
          }
          it->usages.push_back(usage);
          found = true;
        }
      }
    }
    if (!found) {
      PANIC("Cuda %s to %p of size %lu, unknown pointer", info, dest, size);
    }
  }
#endif
public:
  void alloc(void **ptr, uint64_t size, CudaAllocType alloc_type,
             uint32_t gpu_index, cudaStream_t stream, const char *file,
             int line) {
    std::lock_guard<std::mutex> guard(allocs_mutex);

    auto cache_of_stream = cache.find(stream);
    if (cache_of_stream != cache.end()) {
      auto cache_of_size = cache_of_stream->second.find(size);
      if (cache_of_size != cache_of_stream->second.end() &&
          !cache_of_size->second.empty()) {
        auto cached_alloc = cache_of_size->second.front();
        cache_of_size->second.pop_front();

        // move to active allocs
        cuda_allocs.push_back(cached_alloc);
        *ptr = cached_alloc.ptr;

        if (cache_size < size) {
          PANIC("INVALID CACHE USE!!");
        }

        cache_size -= size;

#ifdef DEBUG_MEMORY_MANAGER
        printf("Cuda Allocation serviced from cache: %p of size %lu on gpu %d "
               "in %s\n",
               ptr, size, gpu_index, "");
#endif
        return;
      }
    }

    cuda_set_device(gpu_index);
    if (alloc_type == SYNC) {
      check_cuda_error(cudaMalloc(ptr, size));
    } else if (alloc_type == ASYNC) {
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
      int support_async_alloc;
      check_cuda_error(cudaDeviceGetAttribute(
          &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

      if (support_async_alloc) {
        check_cuda_error(cudaMallocAsync(ptr, size, stream));
      } else {
        check_cuda_error(cudaMalloc(ptr, size));
      }
#else
      check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
    } else {
      PANIC("Invalid allocation mode");
    }

    if (*ptr == nullptr) {
      if (size > 0) {
        PANIC("Allocation failed for %lu bytes, allocator returned %p", size,
              ptr);
      }
      return;
    }

    auto thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id());
    CudaMemBlock block = {(int8_t *)*ptr, size,      stream,
                          gpu_index,      thread_id, alloc_type};
#ifdef DEBUG_MEMORY_MANAGER
    CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
                               CUDA_ALLOC};
    block.usages.push_back(usage);

    printf("Cuda Allocated %p of size %lu on gpu %d in %s\n", ptr, size,
           gpu_index, usage.location.c_str());
#endif

    cuda_allocs.push_back(block);
  }
  void memset(int8_t *dest, uint64_t size, cudaStream_t stream,
              const char *file, int line) {
#ifdef DEBUG_MEMORY_MANAGER
    std::lock_guard<std::mutex> guard(allocs_mutex);

    check_range_is_valid(MEMSET, dest, size, stream, file, line);
#endif
  }

  void memcpy(int8_t *dest, int8_t *src, uint64_t size, cudaStream_t stream,
              const char *file, int line) {
#ifdef DEBUG_MEMORY_MANAGER
    std::lock_guard<std::mutex> guard(allocs_mutex);

    check_range_is_valid(MEMCPY_SRC, src, size, stream, file, line);
    check_range_is_valid(MEMCPY_DEST, src, size, stream, file, line);
#endif
  }
  void free(void *ptr, CudaAllocType alloc_type, uint32_t gpu_index,
            cudaStream_t stream, const char *file, int line) {
    if (ptr == nullptr)
      return;

    std::lock_guard<std::mutex> guard(allocs_mutex);

    bool found = false;
    bool must_free = false;

    for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
      if (it->ptr == ptr && it->gpu_index == gpu_index) {
        found = true;

        if (cache_size + it->size < (MAX_CACHE_SIZE)) {
          cache[stream][it->size].push_back(*it);
          cache_size += it->size;
          if (peak_cache_size < cache_size) {
            peak_cache_size = cache_size;
          }
        } else {
          cuda_freed.push_back(*it);
          must_free = true;
        }
#ifdef DEBUG_MEMORY_MANAGER
        printf("cuda dropped buffer %p of size %lu on gpu %d\n", ptr, it->size,
               gpu_index);
#endif
        cuda_allocs.erase(it++);
      }
    }

    if (must_free) {
      cuda_set_device(gpu_index);
      if (alloc_type == SYNC) {
        check_cuda_error(cudaFree(ptr));
      } else if (alloc_type == ASYNC) {
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
        int support_async_alloc;
        check_cuda_error(cudaDeviceGetAttribute(
            &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

        if (support_async_alloc) {
          check_cuda_error(cudaFreeAsync(ptr, stream));
        } else {
          check_cuda_error(cudaFree(ptr));
        }
#else
        check_cuda_error(cudaFree(ptr));
#endif
      }
    }

#ifdef DEBUG_MEMORY_MANAGER
    if (!found) {
      for (auto it = cuda_freed.begin(); it != cuda_freed.end(); it++) {
        if (it->ptr == ptr && it->gpu_index == gpu_index) {
          found = true;
          printf("Drop in %s: %d\n", file, line);
          printf("Alloc in %s\n", it->usages[0].location.c_str());
          PANIC("cuda drop already dropped buffer %p of size %lu on gpu %d\n",
                ptr, it->size, gpu_index);
        }
      }
    }

    if (!found) {
      PANIC("cuda drop unknown buffer %p\n", ptr);
    }
#endif
  }
  ~CudaMemoryManager() {
#ifdef DEBUG_MEMORY_MANAGER
    printf("%lu ALLOCATIONS AT PROGRAM EXIT\n", cuda_allocs.size());

    for (auto &cuda_alloc : cuda_allocs) {
      printf("%p of size %lu allocated at %s\n", cuda_alloc.ptr,
             cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
    }

    printf("\n\n\n %llu PEAK CACHE SIZE\n", peak_cache_size);

    for (auto &cache_for_size : cache) {
      for (auto &cuda_alloc : cache_for_size.second) {
        printf("%p of size %lu cached at %s\n", cuda_alloc.ptr, cuda_alloc.size,
               cuda_alloc.usages[0].location.c_str());
      }
    }
#endif
  }
};
class CudaMultiGPUMemoryManager {
  std::unordered_map<uint32_t, CudaMemoryManager> gMemManagers;
  std::mutex gMemManagersMutex; // for creation of the mem managers
  std::atomic<uint32_t> gMemManagerExists = 0;

public:
  CudaMemoryManager &get(uint32_t gpu_index) {
    if (gMemManagerExists.load() & (1 << gpu_index)) {
      return gMemManagers[gpu_index];
    } else {
      std::lock_guard<std::mutex> guard(gMemManagersMutex);
      uint32_t exist_flags = gMemManagerExists.load();
      if (!(exist_flags & (1 << gpu_index))) {
        gMemManagers[gpu_index]; // create it
        gMemManagerExists.store(exist_flags | (1 << gpu_index));
      }
      return gMemManagers[gpu_index];
    }
  }
};

CudaMultiGPUMemoryManager gCudaMemoryManager;
#endif

cudaEvent_t cuda_create_event(uint32_t gpu_index) {
  cuda_set_device(gpu_index);
  cudaEvent_t event;
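CudaMultiGPUMemoryManager creates one manager per GPU lazily: a lock-free atomic bitmask is tested first, and only a miss takes the mutex and re-checks before inserting. A self-contained sketch of that double-checked pattern under stated assumptions (generic names, and a fixed 32-slot array standing in for the diff's map, to match the 32-bit mask):

#include <array>
#include <atomic>
#include <cstdint>
#include <memory>
#include <mutex>

struct PerDeviceState { /* anything that must exist exactly once per device */ };

class LazyPerDevice {
  std::array<std::unique_ptr<PerDeviceState>, 32> states; // one slot per mask bit
  std::mutex creation_mutex;
  std::atomic<uint32_t> created_mask{0}; // bit i set once device i is ready

public:
  PerDeviceState &get(uint32_t device) {
    // Fast path: a single atomic load, no lock, once the slot has been published.
    if (created_mask.load(std::memory_order_acquire) & (1u << device))
      return *states[device];

    // Slow path: serialize creation and re-check under the lock so two threads
    // racing on the same device build the state only once.
    std::lock_guard<std::mutex> guard(creation_mutex);
    uint32_t mask = created_mask.load(std::memory_order_relaxed);
    if (!(mask & (1u << device))) {
      states[device] = std::make_unique<PerDeviceState>();
      created_mask.store(mask | (1u << device), std::memory_order_release);
    }
    return *states[device];
  }
};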
@@ -221,24 +560,38 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
  void *ptr = nullptr;
#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, SYNC, gpu_index, 0,
                                          "rust_code", 0);
#else
  cuda_set_device(gpu_index);
  void *ptr;
  check_cuda_error(cudaMalloc((void **)&ptr, size));

#endif
  return ptr;
}

void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index) {
  return cuda_malloc(size, gpu_index);
}

/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory) {
void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
                                                  cudaStream_t stream,
                                                  uint32_t gpu_index,
                                                  uint64_t &size_tracker,
                                                  bool allocate_gpu_memory,
                                                  const char *file, int line) {
  size_tracker += size;
  void *ptr = nullptr;
  if (!allocate_gpu_memory)
    return ptr;

#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, ASYNC, gpu_index, stream,
                                          file, line);
#else
  cuda_set_device(gpu_index);

#ifndef CUDART_VERSION
@@ -256,16 +609,23 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
#else
  check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
#endif

  return ptr;
}

/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index, const char *file, int line) {
  uint64_t size_tracker = 0;
  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
                                              size_tracker, true);
  return cuda_intern_malloc_with_size_tracking_async(
      size, stream, gpu_index, size_tracker, true, file, line);
}

void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
                            uint32_t gpu_index) {
  return cuda_malloc_async(size, stream, gpu_index);
}

/// Check that allocation is valid
@@ -340,6 +700,11 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                               gpu_index, true);
}

void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
}

/// Copy memory within a GPU asynchronously
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    void *dest, void const *src, uint64_t size, cudaStream_t stream,

@@ -372,6 +737,12 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                                   gpu_index, true);
}

void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
                                      uint64_t size, cudaStream_t stream,
                                      uint32_t gpu_index) {
  cuda_memcpy_async_gpu_to_gpu(dest, src, size, stream, gpu_index);
}

/// Copy memory within a GPU
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index) {

@@ -396,6 +767,11 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
  }
}

void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                uint32_t gpu_index) {
  cuda_memcpy_gpu_to_gpu(dest, src, size, gpu_index);
}

/// Synchronizes device
void cuda_synchronize_device(uint32_t gpu_index) {
  cuda_set_device(gpu_index);
@@ -408,6 +784,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;

  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {

@@ -415,6 +792,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
  }
  cuda_set_device(gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
  gCudaMemoryManager.get(gpu_index).memset((int8_t *)dest, size, stream, "", 0);
}

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,

@@ -423,6 +801,11 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                                       true);
}

void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
                           cudaStream_t stream, uint32_t gpu_index) {
  cuda_memset_async(dest, val, size, stream, gpu_index);
}

template <typename Torus>
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;

@@ -474,6 +857,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
}

void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}

/// Return number of GPUs available
int cuda_get_number_of_gpus() {
  int num_gpus;
@@ -489,19 +877,31 @@ int cuda_get_number_of_sms() {
}

/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line) {
#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).free(ptr, SYNC, gpu_index, 0, file, line);
#else
  cuda_set_device(gpu_index);
  check_cuda_error(cudaFree(ptr));
#endif
}

void cuda_ext_drop(void *ptr, uint32_t gpu_index) { cuda_drop(ptr, gpu_index); }

/// Drop a cuda array asynchronously, if the data was allocated & it's supported
/// on the device
void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                        uint32_t gpu_index,
                                        bool gpu_memory_allocated) {
void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
                                            uint32_t gpu_index,
                                            bool gpu_memory_allocated,
                                            const char *file, int line) {

  if (!gpu_memory_allocated)
    return;

#ifdef USE_MEMORY_MANAGER
  gCudaMemoryManager.get(gpu_index).free(ptr, ASYNC, gpu_index, stream, file,
                                         line);
#else
  cuda_set_device(gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!

@@ -518,11 +918,14 @@ void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
#else
  check_cuda_error(cudaFree(ptr));
#endif
#endif
}

/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
  cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, true);
void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
                         const char *file, int line) {
  cuda_int_drop_with_size_tracking_async(ptr, stream, gpu_index, true, file,
                                         line);
}

/// Get the maximum size for the shared memory per streaming multiprocessors
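With the manager enabled, cuda_malloc/cuda_drop become allocate-from-cache-or-device and return-to-cache operations. A hedged usage sketch of the observable behavior, building only on the cuda_ext_* entry points declared in this diff; pointer reuse is the expected outcome when the freed size matches and the cache cap is not exceeded, not a guarantee:

#include <cstdint>
#include <cstdio>

// Assumed declarations, matching the header in this diff (normally one would
// include the backend's device header instead of re-declaring them here).
extern "C" {
void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);
void cuda_ext_drop(void *ptr, uint32_t gpu_index);
}

int main() {
  const uint64_t size = 1 << 20;      // 1 MiB
  void *a = cuda_ext_malloc(size, 0); // first request: a real cudaMalloc on GPU 0
  cuda_ext_drop(a, 0);                // block is parked in the manager's cache
  void *b = cuda_ext_malloc(size, 0); // same size and same (default) stream key:
                                      // typically served from the cache, so b
                                      // often equals a with no driver call made
  std::printf("a=%p b=%p\n", a, b);
  cuda_ext_drop(b, 0);
  return 0;
}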
@@ -11,21 +11,13 @@ extern "C" {

    pub fn cuda_is_available() -> u32;

    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
    pub fn cuda_ext_malloc(size: u64, gpu_index: u32) -> *mut c_void;

    pub fn cuda_malloc_with_size_tracking_async(
        size: u64,
        stream: *mut c_void,
        gpu_index: u32,
        size_tracker: *mut u64,
        allocate_gpu_memory: bool,
    ) -> *mut c_void;

    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
    pub fn cuda_ext_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
    pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
    pub fn cuda_device_total_memory(gpu_index: u32) -> u64;

    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
    pub fn cuda_ext_memcpy_with_size_tracking_async_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
@@ -34,7 +26,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );

    pub fn cuda_memcpy_async_to_gpu(
    pub fn cuda_ext_memcpy_async_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -42,9 +34,14 @@ extern "C" {
        gpu_index: u32,
    );

    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
    pub fn cuda_ext_memcpy_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
        gpu_index: u32,
    );

    pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    pub fn cuda_ext_memcpy_with_size_tracking_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -53,7 +50,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );

    pub fn cuda_memcpy_async_gpu_to_gpu(
    pub fn cuda_ext_memcpy_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -61,7 +58,7 @@ extern "C" {
        gpu_index: u32,
    );

    pub fn cuda_memcpy_async_to_cpu(
    pub fn cuda_ext_memcpy_async_to_cpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,

@@ -78,7 +75,7 @@ extern "C" {
        gpu_memory_allocated: bool,
    );

    pub fn cuda_memset_async(
    pub fn cuda_ext_memset_async(
        dest: *mut c_void,
        val: u64,
        size: u64,

@@ -92,7 +89,7 @@ extern "C" {

    pub fn cuda_synchronize_device(gpu_index: u32);

    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
    pub fn cuda_ext_drop(ptr: *mut c_void, gpu_index: u32);

    pub fn cuda_drop_with_size_tracking_async(
        ptr: *mut c_void,

@@ -102,7 +99,7 @@ extern "C" {
        allocate_gpu_memory: bool,
    );

    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
    pub fn cuda_ext_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);

    pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
@@ -4,7 +4,7 @@ use crate::core_crypto::prelude::{
    CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
    LweDimension, LweSize, UnsignedInteger,
};
use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu;
use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_gpu_to_gpu;

/// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU.
#[derive(Clone, Debug)]

@@ -123,7 +123,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
            * std::mem::size_of::<T>();
        // Concatenate gpu_index memory
        unsafe {
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                ptr,
                first_item.0.d_vec.as_c_ptr(0),
                size as u64,

@@ -132,7 +132,7 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
            );
            ptr = ptr.wrapping_byte_add(size);
            for list in cuda_ciphertexts_list_vec {
                cuda_memcpy_async_gpu_to_gpu(
                cuda_ext_memcpy_async_gpu_to_gpu(
                    ptr,
                    list.0.d_vec.as_c_ptr(0),
                    size as u64,
@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::Numeric;
use std::ffi::c_void;
use std::marker::PhantomData;
use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_async_to_cpu};
use tfhe_cuda_backend::cuda_bind::{
    cuda_ext_memcpy_async_gpu_to_gpu, cuda_ext_memcpy_async_to_cpu,
};

#[derive(Debug, Clone)]
pub struct CudaSlice<'a, T: Numeric> {

@@ -98,7 +100,7 @@ where
        let size = src.len(index) * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(index),
                src.as_c_ptr(index),
                size as u64,

@@ -123,7 +125,7 @@ where
        let size = self.len(index) * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memcpy_async_to_cpu(
            cuda_ext_memcpy_async_to_cpu(
                dest.as_mut_ptr().cast::<c_void>(),
                self.as_c_ptr(index),
                size as u64,
@@ -6,9 +6,9 @@ use std::collections::Bound::{Excluded, Included, Unbounded};
use std::ffi::c_void;
use std::marker::PhantomData;
use tfhe_cuda_backend::cuda_bind::{
    cuda_drop, cuda_malloc, cuda_malloc_async, cuda_memcpy_async_gpu_to_gpu,
    cuda_memcpy_async_to_cpu, cuda_memcpy_async_to_gpu, cuda_memcpy_gpu_to_gpu, cuda_memset_async,
    cuda_synchronize_device,
    cuda_ext_drop, cuda_ext_malloc, cuda_ext_malloc_async, cuda_ext_memcpy_async_gpu_to_gpu,
    cuda_ext_memcpy_async_to_cpu, cuda_ext_memcpy_async_to_gpu, cuda_ext_memcpy_gpu_to_gpu,
    cuda_ext_memset_async, cuda_synchronize_device,
};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]

@@ -74,8 +74,8 @@ impl<T: Numeric> Clone for CudaVec<T> {
        for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
            unsafe {
                cuda_synchronize_device(gpu_index.0);
                let ptr = cuda_malloc(size, gpu_index.0);
                cuda_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                let ptr = cuda_ext_malloc(size, gpu_index.0);
                cuda_ext_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index.0);
                cloned_vec.push(ptr);
            }
        }

@@ -101,12 +101,12 @@ impl<T: Numeric> CudaVec<T> {
    /// - `streams` __must__ be synchronized to guarantee computation has finished
    pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self {
        let size = len as u64 * std::mem::size_of::<T>() as u64;
        let ptr = cuda_malloc_async(
        let ptr = cuda_ext_malloc_async(
            size,
            streams.ptr[stream_index as usize],
            streams.gpu_indexes[stream_index as usize].0,
        );
        cuda_memset_async(
        cuda_ext_memset_async(
            ptr,
            0u64,
            size,

@@ -129,9 +129,9 @@ impl<T: Numeric> CudaVec<T> {
        let mut ptrs = Vec::with_capacity(streams.len());
        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
            let gpu_index = streams.gpu_indexes[i];
            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
            let ptr = unsafe { cuda_ext_malloc_async(size, stream_ptr, gpu_index.0) };
            unsafe {
                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
                cuda_ext_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
            }
            streams.synchronize_one(i as u32);
            ptrs.push(ptr);

@@ -179,7 +179,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = self.len() * std::mem::size_of::<T>();
        // We check that self is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memset_async(
            cuda_ext_memset_async(
                self.as_mut_c_ptr(stream_index),
                value,
                size as u64,

@@ -209,7 +209,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that src is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
            cuda_memcpy_async_to_gpu(
            cuda_ext_memcpy_async_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_ptr().cast(),
                size as u64,

@@ -237,7 +237,7 @@ impl<T: Numeric> CudaVec<T> {
        // invalid pointer being passed to copy_to_gpu_async
        if size > 0 {
            let gpu_index = streams.gpu_indexes[i];
            cuda_memcpy_async_to_gpu(
            cuda_ext_memcpy_async_to_gpu(
                self.get_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                size as u64,

@@ -266,7 +266,7 @@ impl<T: Numeric> CudaVec<T> {
        let size = src.len() * std::mem::size_of::<T>();
        // We check that src is not empty to avoid invalid pointers
        if size > 0 {
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src.as_c_ptr(stream_index),
                size as u64,

@@ -305,7 +305,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                self.as_mut_c_ptr(stream_index),
                src_ptr,
                size as u64,

@@ -342,7 +342,7 @@ impl<T: Numeric> CudaVec<T> {
                .as_mut_c_ptr(stream_index)
                .add(start * std::mem::size_of::<T>());
            let size = (end - start + 1) * std::mem::size_of::<T>();
            cuda_memcpy_async_gpu_to_gpu(
            cuda_ext_memcpy_async_gpu_to_gpu(
                dest_ptr,
                src.as_c_ptr(stream_index),
                size as u64,

@@ -366,7 +366,7 @@ impl<T: Numeric> CudaVec<T> {
        // We have to check that self is not empty, because Rust slice with size 0 results in an
        // invalid pointer being passed to copy_to_cpu_async
        if size > 0 {
            cuda_memcpy_async_to_cpu(
            cuda_ext_memcpy_async_to_cpu(
                dest.as_mut_ptr().cast(),
                self.as_c_ptr(stream_index),
                size as u64,

@@ -484,7 +484,7 @@ impl<T: Numeric> Drop for CudaVec<T> {
            // Synchronizes the device to be sure no stream is still using this pointer
            let gpu_index = self.gpu_indexes[i];
            synchronize_device(gpu_index.0);
            unsafe { cuda_drop(ptr, gpu_index.0) };
            unsafe { cuda_ext_drop(ptr, gpu_index.0) };
        }
    }
}
@@ -20,7 +20,7 @@ use crate::shortint::{AtomicPatternKind, CarryModulus, Ciphertext, MessageModulu
use crate::GpuIndex;
use itertools::Itertools;
use serde::Deserializer;
use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_to_gpu;
use tfhe_cuda_backend::cuda_bind::cuda_ext_memcpy_async_to_gpu;

#[derive(Clone)]
pub struct CudaCompactCiphertextListInfo {

@@ -271,7 +271,7 @@ impl CudaFlattenedVecCompactCiphertextList {
                let dest_ptr = d_flattened_d_vec
                    .as_mut_c_ptr(0)
                    .add(offset * std::mem::size_of::<u64>());
                cuda_memcpy_async_to_gpu(
                cuda_ext_memcpy_async_to_gpu(
                    dest_ptr,
                    container.as_ptr().cast(),
                    (expected_length * std::mem::size_of::<u64>()) as u64,

@@ -15,7 +15,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
    compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
    compress_integer_radix_async, cuda_ext_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
    get_compression_size_on_gpu, get_decompression_size_on_gpu,
};
use crate::prelude::CastInto;

@@ -248,7 +248,7 @@ impl CudaCompressionKey {
                    .as_mut_c_ptr(0)
                    .add(offset * std::mem::size_of::<u64>());
                let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
                cuda_memcpy_async_gpu_to_gpu(
                cuda_ext_memcpy_async_gpu_to_gpu(
                    dest_ptr,
                    ciphertext.d_blocks.0.d_vec.as_c_ptr(0),
                    size as u64,

@@ -577,7 +577,7 @@ impl CudaNoiseSquashingCompressionKey {
                    .as_mut_c_ptr(0)
                    .add(offset * std::mem::size_of::<u128>());
                let size = ciphertext.packed_d_blocks.0.d_vec.len * std::mem::size_of::<u128>();
                cuda_memcpy_async_gpu_to_gpu(
                cuda_ext_memcpy_async_gpu_to_gpu(
                    dest_ptr,
                    ciphertext.packed_d_blocks.0.d_vec.as_c_ptr(0),
                    size as u64,