chore(gpu): add assert macro

This commit is contained in:
Andrei Stoian
2025-08-04 15:39:13 +02:00
committed by Andrei Stoian
parent 451458df97
commit 71f427de9e
12 changed files with 109 additions and 67 deletions

View File

@@ -19,6 +19,11 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort();
}
}
+// The PANIC macro should be used to validate user inputs to GPU functions;
+// it executes in all build targets, including production settings. E.g., a
+// cudaMemcpy to the device should check that the destination pointer is a
+// device pointer.
#define PANIC(format, ...) \
{ \
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
@@ -26,6 +31,30 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}
+// A generic assertion-checking macro with a user-defined printf-style
+// message.
+#define PANIC_IF_FALSE(cond, format, ...) \
+  do { \
+    if (!(cond)) { \
+      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond); \
+    } \
+  } while (0)
+
+#ifndef GPU_ASSERTS_DISABLE
+// GPU_ASSERT should be used to validate assumptions made by algorithms:
+// for example, that two user-provided quantities have a certain
+// relationship, or that a buffer passed to a function is large enough when
+// it is filled by an algorithm whose output size depends on user-provided
+// inputs. E.g., the OPRF corrections buffer should not be larger than the
+// number of blocks in the generated datatype.
+#define GPU_ASSERT(cond, format, ...) \
+  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
+#else
+// Disabled variant: must accept the same arguments so call sites still
+// compile.
+#define GPU_ASSERT(cond, format, ...) \
+  do { \
+  } while (0)
+#endif
uint32_t cuda_get_device();
void cuda_set_device(uint32_t gpu_index);
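
A minimal usage sketch of the two macros (the function and parameter names below are hypothetical, not part of this commit): PANIC_IF_FALSE validates user-provided inputs in every build, while GPU_ASSERT checks algorithmic invariants and compiles away when GPU_ASSERTS_DISABLE is defined.

// Sketch only; assumes the macros above are in scope via this header.
void copy_lut_to_gpu(uint64_t *d_dest, uint64_t const *h_src,
                     uint32_t num_entries, uint32_t dest_capacity) {
  // User-input validation: always active, even in production builds.
  PANIC_IF_FALSE(d_dest != nullptr && h_src != nullptr,
                 "Cuda error: null pointer passed to copy_lut_to_gpu");
  // Algorithmic invariant: removed when built with -DGPU_ASSERTS_DISABLE.
  GPU_ASSERT(num_entries <= dest_capacity,
             "copy_lut_to_gpu: %u entries exceed capacity %u", num_entries,
             dest_capacity);
  cudaMemcpy(d_dest, h_src, num_entries * sizeof(uint64_t),
             cudaMemcpyHostToDevice);
}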

View File

@@ -54,9 +54,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t max_shared_memory) {
-  if (gpu_count != 1)
-    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
-          "supported yet.")
+  PANIC_IF_FALSE(gpu_count == 1,
+                 "GPU error (batch_fft_ggsw_vector): multi-GPU execution on %u "
+                 "GPUs is not supported yet.",
+                 gpu_count);
cuda_set_device(gpu_indexes[0]);
int shared_memory_size = sizeof(double) * polynomial_size;

View File

@@ -124,8 +124,10 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
num_blocks_per_sample, num_threads_x);
int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
-  if (num_blocks_per_sample > 65536)
-    PANIC("Cuda error (Keyswitch): number of blocks per sample is too large");
+  PANIC_IF_FALSE(
+      num_blocks_per_sample <= 65536,
+      "Cuda error (Keyswitch): number of blocks per sample (%d) is too large",
+      num_blocks_per_sample);
// In multiplication of large integers (512, 1024, 2048), the number of
// samples can be larger than 65536, so we need to set it in the first

View File

@@ -204,8 +204,9 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
// Shared memory requirement is 8192 bytes for 64-bit Torus elements
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
-  if (shared_mem_size > 8192)
-    PANIC("GEMM kernel error: shared memory required might be too large");
+  // Sanity check: the shared memory size is a constant defined by the
+  // algorithm.
+  GPU_ASSERT(shared_mem_size <= 8192,
+             "GEMM kernel error: shared memory required might be too large");
tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
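
GPU_ASSERT fits here rather than PANIC_IF_FALSE because the shared memory size is an internal constant of the algorithm, not a user input, so the check can be stripped from release builds. A sketch of that build-time behavior (hypothetical translation unit; the header name is assumed):

// With GPU_ASSERTS_DISABLE defined (e.g. -DGPU_ASSERTS_DISABLE in a release
// build), GPU_ASSERT expands to an empty statement while PANIC_IF_FALSE
// stays active.
#define GPU_ASSERTS_DISABLE
#include "device.h" // assumed header providing the macros

void launch_tgemm_checked(uint32_t shared_mem_size, uint32_t num_lwes) {
  PANIC_IF_FALSE(num_lwes > 0, "Cuda error: num_lwes must be non-zero");
  GPU_ASSERT(shared_mem_size <= 8192, // no-op in this build
             "GEMM kernel error: shared memory required might be too large");
}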

View File

@@ -298,15 +298,14 @@ __host__ void host_improve_noise_modulus_switch(
const double input_variance, const double r_sigma, const double bound,
uint32_t log_modulus) {
-  if (lwe_size < 512) {
-    PANIC("The lwe_size is less than 512, this is not supported\n");
-    return;
-  }
-  if (lwe_size > 1024) {
-    PANIC("The lwe_size is greater than 1024, this is not supported\n");
-    return;
-  }
+  PANIC_IF_FALSE(lwe_size >= 512,
+                 "The lwe_size (%u) is less than 512, which is not supported",
+                 lwe_size);
+  PANIC_IF_FALSE(
+      lwe_size <= 1024,
+      "The lwe_size (%u) is greater than 1024, which is not supported",
+      lwe_size);
cuda_set_device(gpu_index);
// This reduction requires a power of two num of threads
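
The power-of-two requirement noted above is exactly the kind of algorithmic assumption the new GPU_ASSERT targets; a sketch (hypothetical helper, not part of this commit):

// x is a power of two iff x != 0 and (x & (x - 1)) == 0.
inline void assert_power_of_two_threads(uint32_t num_threads) {
  GPU_ASSERT(num_threads != 0 && (num_threads & (num_threads - 1)) == 0,
             "reduction requires a power-of-two thread count, got %u",
             num_threads);
}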

View File

@@ -196,14 +196,14 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
check_cuda_error(
@@ -227,14 +227,14 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
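
The same dest/src validation now appears four times across these two copy functions and could be factored into a helper; a sketch (the helper name is hypothetical, not part of this commit):

// Hypothetical helper: validate that a pointer refers to device memory
// before a GPU-to-GPU copy; `label` names the argument for the message.
static inline void panic_if_not_device_ptr(void const *ptr,
                                           const char *label) {
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
  PANIC_IF_FALSE(attr.type == cudaMemoryTypeDevice,
                 "Cuda error: invalid %s device pointer in copy from GPU to "
                 "GPU.",
                 label);
}

Call sites would shrink to panic_if_not_device_ptr(dest, "dest") and panic_if_not_device_ptr(src, "src"); since both functions also compare attr_src.device with attr_dest.device afterwards, the helper would need to return the attributes rather than discard them.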

View File

@@ -20,12 +20,15 @@ __host__ void host_integer_radix_bitop_kb(
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-  if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
-      lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
-  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
-      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
-    PANIC("Cuda error: input and output lwe dimension must be equal")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
+          lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
+      "Cuda error: input and output num radix blocks must be equal");
+  PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
+                     lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
+                 "Cuda error: input and output lwe dimension must be equal");
auto lut = mem_ptr->lut;
uint64_t degrees[lwe_array_1->num_radix_blocks];
if (mem_ptr->op == BITOP_TYPE::BITAND) {

View File

@@ -25,10 +25,10 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
const uint32_t input_start_lwe_index =
input->num_radix_blocks - output->num_radix_blocks;
-  if (input->num_radix_blocks <= output->num_radix_blocks) {
-    PANIC("Cuda error: input num blocks should be greater than output num "
-          "blocks");
-  }
+  PANIC_IF_FALSE(input->num_radix_blocks > output->num_radix_blocks,
+                 "Cuda error: input num blocks (%u) should be greater than "
+                 "output num blocks (%u)",
+                 input->num_radix_blocks, output->num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
@@ -70,9 +70,7 @@ __host__ void host_extend_radix_with_sign_msb(
PUSH_RANGE("cast/extend")
const uint32_t input_blocks = input->num_radix_blocks;
-  if (input_blocks == 0) {
-    PANIC("Cuda error: input blocks cannot be zero");
-  }
+  PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input_blocks, input, 0,

View File

@@ -15,14 +15,18 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
-  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
-      lwe_array_input->num_radix_blocks < num_radix_blocks)
-    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
-          "blocks")
-  if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension ||
-      lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension)
-    PANIC("Cuda error: input and output radix ciphertexts must have the same "
-          "lwe dimension")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks >= num_radix_blocks &&
+          lwe_array_input->num_radix_blocks >= num_radix_blocks,
+      "Cuda error: input or output radix ciphertexts do not have enough "
+      "blocks");
+  PANIC_IF_FALSE(
+      lwe_array_out->lwe_dimension == lwe_array_input->lwe_dimension &&
+          lwe_array_input->lwe_dimension == lwe_condition->lwe_dimension,
+      "Cuda error: input and output radix ciphertexts must have the same "
+      "lwe dimension");
cuda_set_device(gpu_indexes[0]);
auto params = mem_ptr->params;

View File

@@ -101,17 +101,20 @@ __host__ void host_radix_blocks_rotate_right(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
CudaRadixCiphertextFFI *src, uint32_t rotations, uint32_t num_blocks) {
-  if (src == dst) {
-    PANIC("Cuda error (blocks_rotate_right): the source and destination "
-          "pointers should be different");
-  }
-  if (dst->lwe_dimension != src->lwe_dimension)
-    PANIC("Cuda error: input and output should have the same "
-          "lwe dimension")
-  if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
-    PANIC("Cuda error: input and output should have more blocks than asked for "
-          "in the "
-          "function call")
+  PANIC_IF_FALSE(src != dst,
+                 "Cuda error (blocks_rotate_right): the source and destination "
+                 "pointers should be different");
+  PANIC_IF_FALSE(dst->lwe_dimension == src->lwe_dimension,
+                 "Cuda error: input and output should have the same "
+                 "lwe dimension");
+  PANIC_IF_FALSE(dst->num_radix_blocks >= num_blocks &&
+                     src->num_radix_blocks >= num_blocks,
+                 "Cuda error: input and output should have at least as many "
+                 "blocks as requested in the function call");
auto lwe_size = src->lwe_dimension + 1;

View File

@@ -229,8 +229,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
-  if (base_log > 64)
-    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%u) should be <= 64",
+                 base_log);
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;

View File

@@ -31,13 +31,13 @@ __host__ void host_expand_without_verification(
: mem_ptr->casting_params.small_lwe_dimension);
auto d_lwe_compact_input_indexes = mem_ptr->d_lwe_compact_input_indexes;
auto d_body_id_per_compact_list = mem_ptr->d_body_id_per_compact_list;
-  if (sizeof(Torus) == 8) {
-    cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
-                       lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
-                       d_lwe_compact_input_indexes, d_body_id_per_compact_list);
-  } else
-    PANIC("Cuda error: expand is only supported on 64 bits")
+  GPU_ASSERT(sizeof(Torus) == 8,
+             "Cuda error: expand is only supported on 64 bits");
+  cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
+                     lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
+                     d_lwe_compact_input_indexes, d_body_id_per_compact_list);
auto ksks = casting_keys;
auto lwe_array_input = expanded_lwes;
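
Because Torus is a template parameter here, sizeof(Torus) is known at compile time, so an alternative is to reject unsupported instantiations at build time rather than assert at run time. A sketch (standalone illustration, not part of this commit):

#include <cstdint>

// static_assert fires when the template is instantiated for a non-64-bit
// Torus, instead of panicking (or silently proceeding with asserts
// disabled) at run time.
template <typename Torus>
void expand_checked(Torus const * /*in*/, Torus * /*out*/) {
  static_assert(sizeof(Torus) == 8,
                "expand is only supported on 64-bit Torus types");
  // ... dispatch to the 64-bit expansion kernel here ...
}

The trade-off: static_assert fails any build that instantiates the template for a 32-bit Torus, even on paths that never execute, while the run-time GPU_ASSERT disappears entirely under GPU_ASSERTS_DISABLE.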