feat(cuda): introduce CUDA acceleration for the PBS and keyswitch

- a new crate, concrete-cuda, is added to the repository, containing
CUDA implementations of the bootstrap and keyswitch, together with a
Rust wrapper to call them (the FFI pattern is sketched below)
- a new backend_cuda is added to concrete-core, with dedicated entities
whose memory lives on the GPU, and engines that call the
CUDA-accelerated functions
Author: Agnès Leroy
Date: 2021-10-20 09:34:57 +02:00
Commit: 64521f6747
37 changed files with 15143 additions and 0 deletions
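
For context on the first bullet point: the Rust wrapper reaches the CUDA code through a plain C ABI. The sketch below illustrates that pattern only; the entry point, kernel, and parameters are invented for the example and are not the actual concrete-cuda API.

#include <cstdint>

// Hypothetical placeholder kernel, for illustration only.
__global__ void example_keyswitch_kernel(uint64_t *out, const uint64_t *in) {
  out[threadIdx.x] = in[threadIdx.x];
}

// An extern "C" shim gives the function an unmangled symbol that the
// Rust side can declare and call through FFI.
extern "C" void cuda_example_keyswitch(void *v_stream, uint64_t *lwe_out,
                                       const uint64_t *lwe_in) {
  auto stream = static_cast<cudaStream_t>(v_stream);
  example_keyswitch_kernel<<<1, 256, 0, stream>>>(lwe_out, lwe_in);
}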


@@ -0,0 +1,15 @@
// Round x up to the next power of two (returns x unchanged if it is
// already a power of two).
int nextPow2(int x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

// Compute a launch configuration for a reduction over n elements: each
// thread handles two elements, and the block size is capped at
// maxBlockSize.
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
                            int &threads) {
  threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
  blocks = (n + threads - 1) / threads;
}
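
As a quick check of the helper above, the launch configuration for a hypothetical reduction over 1000 elements with a 512-thread block cap works out as follows:

int blocks, threads;
// n = 1000 < 2 * 512, so threads = nextPow2((1000 + 1) / 2)
//                                = nextPow2(500) = 512,
// and blocks = (1000 + 512 - 1) / 512 = 2.
getNumBlocksAndThreads(1000, 512, blocks, threads);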

src/utils/memory.cuh

@@ -0,0 +1,90 @@
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H
#include "helper_cuda.h"
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>
// Carves typed sub-allocations out of a single dynamic shared memory
// block passed at kernel launch. Offsets are not re-aligned, so request
// the most strictly aligned types first.
class SharedMemory {
public:
  char *m_memory_block;
  int m_last_byte;

  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  // Hand out the next `elements * sizeof(T)` bytes of the block as a T*.
  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};
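
// --- Illustrative sketch, not part of the original commit: a kernel
// --- typically uses SharedMemory like this to slice one dynamic shared
// --- memory block into typed buffers (names here are invented).
extern __shared__ char sharedmem[];

__global__ void example_shared_memory_user(double *data, int n) {
  SharedMemory pool(sharedmem);
  double *buffer; // first n * sizeof(double) bytes (strictest alignment first)
  int *flags;     // next n * sizeof(int) bytes
  pool.get_allocation(&buffer, n);
  pool.get_allocation(&flags, n);
  buffer[threadIdx.x] = data[threadIdx.x];
  flags[threadIdx.x] = 1;
  __syncthreads();
  data[threadIdx.x] = buffer[threadIdx.x] * flags[threadIdx.x];
}
// --- Launched with the combined size as the dynamic shared memory arg:
// --- example_shared_memory_user<<<1, n, n * (sizeof(double) + sizeof(int))>>>
// --- (d_data, n); end of illustrative sketch. ---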
// Tracks device allocations so they can be freed in bulk, per device or
// at destruction time.
class DeviceMemory {
public:
  std::vector<std::tuple<void *, int>> m_allocated;
  std::mutex m_allocation_mtx;
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  // Record the highest device index seen; indices are zero-based, hence
  // the >= comparison (`>` would miss device m_total_devices itself).
  __host__ void set_device(int device) {
    if (device >= (int)m_total_devices)
      m_total_devices = device + 1;
  }
  // Allocate `elements` items of type T on the GPU and record the
  // allocation for later cleanup; `device` is only used for bookkeeping,
  // the caller is expected to have selected the device beforehand.
  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Allocate and asynchronously copy `elements` items from host to device
  // on the default stream.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * elements));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  // Variant that allocates room for `allocation` items but only copies
  // `elements` of them, for buffers larger than their initial contents.
  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res;
    checkCudaErrors(cudaMalloc((void **)&res, sizeof(T) * allocation));
    checkCudaErrors(cudaMemcpyAsync(res, src, sizeof(T) * elements,
                                    cudaMemcpyHostToDevice));
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }
  // Free all allocations made on `device` and drop them from the tracking
  // list, so the destructor does not free them a second time.
  void free_all_from_device(int device) {
    cudaSetDevice(device);
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    for (auto it = m_allocated.begin(); it != m_allocated.end();) {
      if (std::get<1>(*it) == device) {
        checkCudaErrors(cudaFree(std::get<0>(*it)));
        it = m_allocated.erase(it);
      } else {
        ++it;
      }
    }
  }
  // Free whatever is still tracked when the instance is destroyed.
  __host__ ~DeviceMemory() {
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      auto mem = std::get<0>(elem);
      cudaSetDevice(dev);
      checkCudaErrors(cudaFree(mem));
    }
  }
};
#endif // CNCRT_SHMEM_H
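
A short host-side usage sketch for DeviceMemory; the buffer names and sizes below are invented for the example:

#include "memory.cuh"

DeviceMemory device_memory;

void example_host_usage(uint64_t *h_input, int elements) {
  uint64_t *d_input, *d_output;
  // Copy the input to device 0 and allocate an output buffer there.
  device_memory.get_allocation_and_copy_async(&d_input, h_input, elements, 0);
  device_memory.get_allocation(&d_output, elements, 0);
  // ... launch kernels reading d_input and writing d_output ...
  cudaDeviceSynchronize();
  // Release everything that was allocated on device 0.
  device_memory.free_all_from_device(0);
}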

src/utils/timer.cuh

@@ -0,0 +1,29 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H
#include <iostream>

#define synchronize_threads_in_block() __syncthreads()
// RAII timer: when `active` is true, measures the wall-clock time between
// construction and destruction with CUDA events and prints it.
template <bool active> class CudaMeasureExecution {
public:
  cudaEvent_t m_start, m_stop;

  __host__ CudaMeasureExecution() {
    if constexpr (active) {
      cudaEventCreate(&m_start);
      cudaEventCreate(&m_stop);
      cudaEventRecord(m_start);
    }
  }
  __host__ ~CudaMeasureExecution() {
    if constexpr (active) {
      float ms;
      cudaEventRecord(m_stop);
      cudaEventSynchronize(m_stop);
      cudaEventElapsedTime(&ms, m_start, m_stop);
      std::cout << "Execution took " << ms << "ms" << std::endl;
      // Release the events to avoid leaking them across many timings.
      cudaEventDestroy(m_start);
      cudaEventDestroy(m_stop);
    }
  }
};
#endif // CNCRT_TIMER_H
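
Usage is scope-based; a minimal sketch, with a stand-in kernel:

__global__ void noop_kernel() {}

void timed_launch() {
  // Everything between construction and the end of the scope is timed.
  CudaMeasureExecution<true> timer;
  noop_kernel<<<1, 1>>>();
} // the destructor records m_stop, synchronizes, and prints the elapsed time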