refactor(gpu): clean up compression

Enzo Di Maria
2025-07-30 16:50:51 +02:00
committed by enzodimaria
parent 46a7229c81
commit d1c417bf71
10 changed files with 295 additions and 237 deletions

View File

@@ -3,6 +3,26 @@
#include "../../pbs/pbs_enums.h"
typedef struct {
void *ptr;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
typedef struct {
void *ptr;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
// Input LWEs are grouped into groups of `lwe_per_glwe` (the last group may
// be smaller). Each group is then packed into one GLWE with one body per
// LWE of the group. In the end, the total number of bodies equals the
// number of input LWEs.
uint32_t total_lwe_bodies_count;
uint32_t glwe_dimension;
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
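// Illustrative sketch (hypothetical helpers, not part of this header): how the
// grouping described above translates into sizes. The last group may be
// partial, hence the ceil-division and the remainder handling.
static inline uint32_t
packed_glwe_count(const CudaPackedGlweCiphertextListFFI *list) {
  return (list->total_lwe_bodies_count + list->lwe_per_glwe - 1) /
         list->lwe_per_glwe;
}
static inline uint32_t
last_group_body_count(const CudaPackedGlweCiphertextListFFI *list) {
  uint32_t rem = list->total_lwe_bodies_count % list->lwe_per_glwe;
  return rem == 0 ? list->lwe_per_glwe : rem;
}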
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -10,28 +30,29 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
uint32_t num_nths, int8_t *mem_ptr);
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -5,11 +5,6 @@
template <typename Torus> struct int_compression {
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
uint32_t body_count;
// Compression
int8_t *fp_ks_buffer;
Torus *tmp_lwe;
@@ -19,13 +14,9 @@ template <typename Torus> struct int_compression {
int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
@@ -58,11 +49,7 @@ template <typename Torus> struct int_compression {
template <typename Torus> struct int_decompression {
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t num_radix_blocks;
uint32_t body_count;
uint32_t num_blocks_to_decompress;
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
@@ -74,15 +61,12 @@ template <typename Torus> struct int_decompression {
int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->num_radix_blocks = num_radix_blocks;
this->body_count = body_count;
this->num_blocks_to_decompress = num_blocks_to_decompress;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
@@ -90,18 +74,18 @@ template <typename Torus> struct int_decompression {
compression_params.polynomial_size +
1);
decompression_rescale_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);
num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0],
num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);
num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
// Rescale is done using an identity LUT
// Here we do not divide by message_modulus
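// Illustrative sketch (hypothetical helper, not the library's LUT builder):
// the plaintext table an identity rescale LUT encodes. Assuming the usual
// one-bit-of-padding encoding, each of the message_modulus * carry_modulus
// boxes re-encodes its own index, and no division by message_modulus occurs.
static inline void fill_identity_rescale_lut(uint64_t *lut,
                                             uint64_t message_modulus,
                                             uint64_t carry_modulus) {
  uint64_t box_count = message_modulus * carry_modulus;
  uint64_t delta = (1ULL << 63) / box_count; // one bit of padding assumed
  for (uint64_t i = 0; i < box_count; ++i)
    lut[i] = i * delta; // identity: keep message and carry as-is
}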

View File

@@ -123,7 +123,6 @@ __host__ void host_modulus_switch_inplace(cudaStream_t stream,
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
modulus_switch_inplace<Torus>
<<<num_blocks, num_threads, 0, stream>>>(array, size, log_modulus);
check_cuda_error(cudaGetLastError());

View File

@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
@@ -18,17 +17,16 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
compression_params, lwe_per_glwe, storage_log_modulus,
allocate_gpu_memory);
compression_params, lwe_per_glwe, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
bool allocate_ms_array) {
// Decompression doesn't keyswitch, so big and small dimensions are the same
@@ -45,31 +43,29 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_blocks_to_decompress,
encryption_params, compression_params, allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
uint32_t num_nths, int8_t *mem_ptr) {
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr) {
host_integer_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), (uint64_t *const *)(fp_ksk),
num_nths, (int_compression<uint64_t> *)mem_ptr);
host_integer_compress<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, glwe_array_out, lwe_array_in,
(uint64_t *const *)(fp_ksk),
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) {
CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(glwe_in), indexes_array, indexes_array_size,
bsks, (int_decompression<uint64_t> *)mem_ptr);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
indexes_array, bsks, (int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(

View File

@@ -48,19 +48,19 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
/// implementation
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t num_glwes,
uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
if (array_in == array_out)
CudaPackedGlweCiphertextListFFI *array_out,
Torus *array_in, uint32_t num_glwes,
int_radix_params compression_params) {
if (array_in == (Torus *)array_out->ptr)
PANIC("Cuda error: Input and output must be different");
cuda_set_device(gpu_index);
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto log_modulus = array_out->storage_log_modulus;
// Mask coefficients of all `num_glwes` GLWEs plus one body per input LWE
auto in_len = num_glwes * compression_params.glwe_dimension *
compression_params.polynomial_size +
num_lwes;
array_out->total_lwe_bodies_count;
auto number_bits_to_pack = in_len * log_modulus;
@@ -73,18 +73,18 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
out_len, in_len, out_len);
pack<Torus><<<grid, threads, 0, stream>>>(
(Torus *)array_out->ptr, array_in, log_modulus, out_len, in_len, out_len);
check_cuda_error(cudaGetLastError());
}
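// Illustrative sketch (hypothetical helper, not part of this file): the size
// in u64 words of the packed output for `in_len` coefficients that were each
// reduced to `log_modulus` bits by the preceding modulus switch.
static inline uint32_t packed_len_u64(uint32_t in_len, uint32_t log_modulus) {
  uint32_t number_bits_to_pack = in_len * log_modulus;
  uint32_t nbits = sizeof(uint64_t) * 8; // 64 bits per output word
  return (number_bits_to_pack + nbits - 1) / nbits;
}
// e.g. (illustrative) in_len = 13192 and log_modulus = 12 pack into
// ceil(13192 * 12 / 64) = 2474 u64 words.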
template <typename Torus>
__host__ void
host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *glwe_array_out,
Torus const *lwe_array_in, Torus *const *fp_ksk,
uint32_t num_radix_blocks,
int_compression<Torus> *mem_ptr) {
uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in,
Torus *const *fp_ksk, int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
auto input_lwe_dimension = compression_params.small_lwe_dimension;
@@ -93,14 +93,14 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
auto lwe_shifted = mem_ptr->tmp_lwe;
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
(uint64_t)compression_params.message_modulus, input_lwe_dimension,
num_radix_blocks);
(uint64_t)compression_params.message_modulus);
uint32_t lwe_in_size = input_lwe_dimension + 1;
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes =
(num_radix_blocks + mem_ptr->lwe_per_glwe - 1) / mem_ptr->lwe_per_glwe;
uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
glwe_array_out->lwe_per_glwe - 1) /
glwe_array_out->lwe_per_glwe;
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -109,12 +109,13 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
compression_params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
auto rem_lwes = num_radix_blocks;
auto rem_lwes = glwe_array_out->total_lwe_bodies_count;
auto lwe_subset = lwe_shifted;
auto glwe_out = tmp_glwe_array_out;
while (rem_lwes > 0) {
auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);
auto chunk_size = min(rem_lwes, glwe_array_out->lwe_per_glwe);
host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
@@ -128,15 +129,16 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
}
// Modulus switch
host_modulus_switch_inplace<Torus>(
streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes * compression_params.glwe_dimension *
compression_params.polynomial_size +
num_radix_blocks,
mem_ptr->storage_log_modulus);
int size = num_glwes * compression_params.glwe_dimension *
compression_params.polynomial_size +
glwe_array_out->total_lwe_bodies_count;
host_modulus_switch_inplace<Torus>(streams[0], gpu_indexes[0],
tmp_glwe_array_out, size,
glwe_array_out->storage_log_modulus);
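// (Illustrative note) the switch above reduces every one of the `size`
// coefficients to storage_log_modulus significant bits (with rounding), which
// is what lets host_pack below store exactly log_modulus bits per coefficient.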
host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
tmp_glwe_array_out, num_glwes, num_radix_blocks, mem_ptr);
tmp_glwe_array_out, num_glwes, compression_params);
}
template <typename Torus>
@@ -176,34 +178,38 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
/// This function follows the naming used in the CPU implementation
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus const *array_in,
uint32_t glwe_index,
int_decompression<Torus> *mem_ptr) {
if (array_in == glwe_array_out)
Torus *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *array_in,
uint32_t glwe_index) {
if ((Torus *)array_in->ptr == glwe_array_out)
PANIC("Cuda error: Input and output must be different");
cuda_set_device(gpu_index);
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto log_modulus = array_in->storage_log_modulus;
auto total_lwe_bodies_count = array_in->total_lwe_bodies_count;
auto polynomial_size = array_in->polynomial_size;
auto glwe_dimension = array_in->glwe_dimension;
uint32_t body_count = mem_ptr->body_count;
auto num_glwes = (body_count + compression_params.polynomial_size - 1) /
compression_params.polynomial_size;
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
uint32_t num_glwes =
(total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
// Number of LWE bodies stored in the GLWE we want to extract (the last GLWE
// may be only partially filled)
if (mem_ptr->body_count % compression_params.polynomial_size == 0)
body_count = compression_params.polynomial_size;
else if (glwe_index == num_glwes - 1)
body_count = mem_ptr->body_count % compression_params.polynomial_size;
else
body_count = compression_params.polynomial_size;
uint32_t body_count = 0;
if (glwe_index == num_glwes - 1) {
auto remainder = total_lwe_bodies_count % polynomial_size;
if (remainder == 0) {
body_count = polynomial_size;
} else {
body_count = remainder;
}
} else {
body_count = polynomial_size;
}
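// Illustrative values: with total_lwe_bodies_count = 5000 and
// polynomial_size = 4096, num_glwes = 2; glwe_index 0 holds 4096 bodies and
// the last GLWE (glwe_index 1) holds the remaining 904.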
auto initial_out_len =
compression_params.glwe_dimension * compression_params.polynomial_size +
body_count;
uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
// Calculates how many bits this particular GLWE shall use
auto number_bits_to_unpack = initial_out_len * log_modulus;
@@ -213,7 +219,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
// Uses that length to set the input pointer
auto chunk_array_in = array_in + glwe_index * len;
auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
// Ensure the tail of the GLWE is zeroed
if (initial_out_len < glwe_ciphertext_size) {
@@ -232,25 +238,24 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
}
template <typename Torus>
__host__ void host_integer_decompress(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in,
uint32_t const *h_indexes_array, uint32_t indexes_array_size,
void *const *d_bsks, int_decompression<Torus> *h_mem_ptr) {
__host__ void
host_integer_decompress(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *d_lwe_array_out,
CudaPackedGlweCiphertextListFFI const *d_packed_glwe_in,
uint32_t const *h_indexes_array, void *const *d_bsks,
int_decompression<Torus> *h_mem_ptr) {
auto num_blocks_to_decompress = h_mem_ptr->num_blocks_to_decompress;
auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
indexes_array_size * sizeof(uint32_t), streams[0],
gpu_indexes[0]);
num_blocks_to_decompress * sizeof(uint32_t),
streams[0], gpu_indexes[0]);
auto compression_params = h_mem_ptr->compression_params;
auto lwe_per_glwe = compression_params.polynomial_size;
auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
if (num_radix_blocks != indexes_array_size)
PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
"should be the same as indexes_array_size.")
// The first element of each pair is the number of LWEs that lie in the
// corresponding GLWE
std::vector<std::pair<int, Torus *>> glwe_vec;
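// Illustrative values: with lwe_per_glwe = 4096 and
// h_indexes_array = {0, 1, 2, 4096, 4097}, the loop below builds
// glwe_vec = {(3, glwe_0), (2, glwe_1)}: three LWEs are read from the first
// packed GLWE and two from the second.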
@@ -261,16 +266,16 @@ __host__ void host_integer_decompress(
auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, current_glwe_index, h_mem_ptr);
d_packed_glwe_in, current_glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
for (int i = 1; i < indexes_array_size; i++) {
for (int i = 1; i < num_blocks_to_decompress; i++) {
auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
if (glwe_index != current_glwe_index) {
extracted_glwe += glwe_accumulator_size;
current_glwe_index = glwe_index;
// Extracts a new GLWE
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, glwe_index, h_mem_ptr);
d_packed_glwe_in, glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
} else {
// Updates the quantity
@@ -306,17 +311,19 @@ __host__ void host_integer_decompress(
/// dimension to a big LWE dimension
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->decompression_rescale_lut;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
lut_stride);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
@@ -332,7 +339,7 @@ __host__ void host_integer_decompress(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_radix_blocks,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
@@ -344,13 +351,14 @@ __host__ void host_integer_decompress(
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks,
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
@@ -365,13 +373,12 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
lwe_per_glwe, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -379,15 +386,13 @@ template <typename Torus>
__host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
uint32_t num_radix_blocks, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
uint32_t num_blocks_to_decompress, int_radix_params encryption_params,
int_radix_params compression_params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
size_tracker);
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
return size_tracker;
}
#endif

View File

@@ -13,6 +13,8 @@
#include <iostream>
#include <vector>
#include "integer/compression/compression.h"
template <typename T>
__global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
T const *cleartext_input,
@@ -51,9 +53,9 @@ __host__ void host_cleartext_vec_multiplication(
}
template <typename T>
__global__ void
cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
__global__ void cleartext_multiplication(T *output, T const *lwe_input,
T cleartext_input,
const uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -64,25 +66,21 @@ cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
}
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T const *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
__host__ void host_cleartext_multiplication(
cudaStream_t stream, uint32_t gpu_index, T *output,
CudaLweCiphertextListFFI const *lwe_input, T cleartext_input) {
cuda_set_device(gpu_index);
// lwe_size includes the body, whereas lwe_dimension only counts the mask
// elements
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
uint32_t num_entries =
lwe_input->num_radix_blocks * (lwe_input->lwe_dimension + 1);
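// (Illustrative) with lwe_dimension = 879, each block contributes 880 u64
// values (mask + body), so num_entries = num_radix_blocks * 880.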
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
output, (T *)lwe_input->ptr, cleartext_input, num_entries);
check_cuda_error(cudaGetLastError());
}

View File

@@ -105,6 +105,55 @@ const _: () = {
ms_input_variance
) - 32usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
@@ -121,7 +170,6 @@ unsafe extern "C" {
carry_modulus: u32,
pbs_type: PBS_TYPE,
lwe_per_glwe: u32,
storage_log_modulus: u32,
allocate_gpu_memory: bool,
) -> u64;
}
@@ -138,12 +186,10 @@ unsafe extern "C" {
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_radix_blocks: u32,
num_blocks_to_decompress: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
storage_log_modulus: u32,
body_count: u32,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
@@ -153,10 +199,9 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
lwe_array_in: *const CudaLweCiphertextListFFI,
fp_ksk: *const *mut ffi::c_void,
num_nths: u32,
mem_ptr: *mut i8,
);
}
@@ -165,10 +210,9 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
glwe_in: *const ffi::c_void,
lwe_array_out: *mut CudaLweCiphertextListFFI,
glwe_in: *const CudaPackedGlweCiphertextListFFI,
indexes_array: *const u32,
indexes_array_size: u32,
bsks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);

View File

@@ -261,7 +261,7 @@ impl CudaCompressedCiphertextList {
v
};
let mut num_bodies_left = gpu_meta.bodies_count;
let mut num_bodies_left = gpu_meta.total_lwe_bodies_count;
let mut chunk_start = 0;
while num_bodies_left != 0 {
let bodies_count = LweCiphertextCount(num_bodies_left.min(lwe_per_glwe.0));
@@ -427,7 +427,7 @@ impl CompressedCiphertextList {
ciphertext_modulus: cpu_meta.ciphertext_modulus,
storage_log_modulus: first_ct.packed_integers().log_modulus(),
lwe_per_glwe,
bodies_count: self.packed_list.len(),
total_lwe_bodies_count: self.packed_list.len(),
initial_len,
})
});

View File

@@ -49,7 +49,7 @@ pub struct CudaPackedGlweCiphertextListMeta {
pub storage_log_modulus: CiphertextModulusLog,
pub lwe_per_glwe: LweCiphertextCount,
// Number of LWE bodies that are compressed in this list
pub bodies_count: usize,
pub total_lwe_bodies_count: usize,
// Number of elements (u64) the uncompressed GLWE list had
// Keep in mind the last GLWE may not be full
pub initial_len: usize,
@@ -69,7 +69,9 @@ impl CudaPackedGlweCiphertextList {
pub fn bodies_count(&self) -> usize {
// If there is no metadata, the list is empty
self.meta.map(|meta| meta.bodies_count).unwrap_or_default()
self.meta
.map(|meta| meta.total_lwe_bodies_count)
.unwrap_or_default()
}
pub fn glwe_ciphertext_count(&self) -> GlweCiphertextCount {
@@ -180,7 +182,7 @@ impl CudaCompressionKey {
let uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
let number_bits_to_pack = uncompressed_len * self.storage_log_modulus.0;
let compressed_len = number_bits_to_pack.div_ceil(u64::BITS as usize);
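// Illustrative values: with num_glwes = 2, glwe_mask_size = 4096,
// num_lwes = 5000 and storage_log_modulus = 12, uncompressed_len
// = 2 * 4096 + 5000 = 13192 and compressed_len = (13192 * 12).div_ceil(64)
// = 2474 u64 words on the GPU.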
let mut packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
let packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
if ciphertexts.is_empty() {
return CudaPackedGlweCiphertextList {
@@ -194,16 +196,30 @@ impl CudaCompressionKey {
let first_ct_info = first_ct.info.blocks.first().unwrap();
let message_modulus = first_ct_info.message_modulus;
let carry_modulus = first_ct_info.carry_modulus;
let lwe_dimension = first_ct.d_blocks.lwe_dimension();
let mut glwe_array_out = CudaPackedGlweCiphertextList {
data: packed_glwe_list,
meta: Some(CudaPackedGlweCiphertextListMeta {
glwe_dimension: compressed_glwe_size.to_glwe_dimension(),
polynomial_size: compressed_polynomial_size,
message_modulus,
carry_modulus,
ciphertext_modulus,
storage_log_modulus: self.storage_log_modulus,
lwe_per_glwe: LweCiphertextCount(compressed_polynomial_size.0),
total_lwe_bodies_count: num_lwes,
initial_len: uncompressed_len,
}),
};
unsafe {
let input_lwes = Self::flatten_async(ciphertexts, streams);
compress_integer_radix_async(
streams,
&mut packed_glwe_list,
&input_lwes.0.d_vec,
&mut glwe_array_out,
&input_lwes,
&self.packing_key_switching_key.d_vec,
message_modulus,
carry_modulus,
@@ -213,29 +229,13 @@ impl CudaCompressionKey {
lwe_pksk.decomposition_base_log(),
lwe_pksk.decomposition_level_count(),
self.lwe_per_glwe.0 as u32,
self.storage_log_modulus.0 as u32,
num_lwes as u32,
);
streams.synchronize();
};
let meta = Some(CudaPackedGlweCiphertextListMeta {
glwe_dimension: compressed_glwe_size.to_glwe_dimension(),
polynomial_size: compressed_polynomial_size,
message_modulus,
carry_modulus,
ciphertext_modulus,
storage_log_modulus: self.storage_log_modulus,
lwe_per_glwe: LweCiphertextCount(compressed_polynomial_size.0),
bodies_count: num_lwes,
initial_len: uncompressed_len,
});
CudaPackedGlweCiphertextList {
data: packed_glwe_list,
meta,
}
glwe_array_out
}
pub fn get_compression_size_on_gpu(
&self,
@@ -259,7 +259,6 @@ impl CudaCompressionKey {
lwe_pksk.decomposition_base_log(),
lwe_pksk.decomposition_level_count(),
self.lwe_per_glwe.0 as u32,
self.storage_log_modulus.0 as u32,
num_lwes,
)
}
@@ -308,7 +307,6 @@ impl CudaDecompressionKey {
let message_modulus = self.message_modulus;
let carry_modulus = self.carry_modulus;
let ciphertext_modulus = self.ciphertext_modulus;
let storage_log_modulus = meta.storage_log_modulus;
match &self.blind_rotate_key {
CudaBootstrappingKey::Classic(bsk) => {
@@ -328,10 +326,9 @@ impl CudaDecompressionKey {
unsafe {
decompress_integer_radix_async(
streams,
&mut output_lwe.0.d_vec,
&packed_list.data,
&mut output_lwe,
packed_list,
&bsk.d_vec,
meta.bodies_count as u32,
message_modulus,
carry_modulus,
encryption_glwe_dimension,
@@ -341,7 +338,6 @@ impl CudaDecompressionKey {
lwe_dimension,
bsk.decomp_base_log(),
bsk.decomp_level_count(),
storage_log_modulus.0 as u32,
indexes_array.as_slice(),
indexes_array_len.0 as u32,
);
@@ -403,7 +399,6 @@ impl CudaDecompressionKey {
let message_modulus = self.message_modulus;
let carry_modulus = self.carry_modulus;
let storage_log_modulus = meta.storage_log_modulus;
match &self.blind_rotate_key {
CudaBootstrappingKey::Classic(bsk) => {
@@ -415,7 +410,6 @@ impl CudaDecompressionKey {
get_decompression_size_on_gpu(
streams,
packed_list.bodies_count() as u32,
message_modulus,
carry_modulus,
encryption_glwe_dimension,
@@ -425,7 +419,6 @@ impl CudaDecompressionKey {
lwe_dimension,
bsk.decomp_base_log(),
bsk.decomp_level_count(),
storage_log_modulus.0 as u32,
indexes_array_len.0 as u32,
)
}

View File

@@ -21,6 +21,7 @@ use crate::core_crypto::prelude::{
use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, KsType};
use crate::integer::gpu::list_compression::server_keys::CudaPackedGlweCiphertextList;
use crate::integer::server_key::radix_parallel::scalar_div_mod::{
choose_multiplier, SignedReciprocable,
};
@@ -100,6 +101,29 @@ pub fn prepare_default_scalar_divisor() -> CudaScalarDivisorFFI {
}
}
fn prepare_cuda_lwe_ct_ffi<T: UnsignedInteger>(
input: &CudaLweCiphertextList<T>,
) -> CudaLweCiphertextListFFI {
CudaLweCiphertextListFFI {
ptr: input.0.d_vec.get_mut_c_ptr(0),
num_radix_blocks: input.0.lwe_ciphertext_count.0 as u32,
lwe_dimension: input.0.lwe_dimension.0 as u32,
}
}
fn prepare_cuda_packed_glwe_ct_ffi(
input: &CudaPackedGlweCiphertextList,
) -> CudaPackedGlweCiphertextListFFI {
CudaPackedGlweCiphertextListFFI {
ptr: input.data.get_mut_c_ptr(0),
storage_log_modulus: input.meta.unwrap().storage_log_modulus.0 as u32,
lwe_per_glwe: input.meta.unwrap().lwe_per_glwe.0 as u32,
total_lwe_bodies_count: input.meta.unwrap().total_lwe_bodies_count as u32,
glwe_dimension: input.meta.unwrap().glwe_dimension.0 as u32,
polynomial_size: input.meta.unwrap().polynomial_size.0 as u32,
}
}
// If we build the Vec<u64> inside prepare_cuda_radix_ffi
// the data gets dropped before the call to the Cuda function,
// and we get memory errors, hence why the reconstruction of
@@ -711,8 +735,8 @@ where
/// is required
pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
streams: &CudaStreams,
glwe_array_out: &mut CudaVec<T>,
lwe_array_in: &CudaVec<T>,
glwe_array_out: &mut CudaPackedGlweCiphertextList,
lwe_array_in: &CudaLweCiphertextList<T>,
fp_keyswitch_key: &CudaVec<u64>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
@@ -722,23 +746,8 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
ks_base_log: DecompositionBaseLog,
ks_level: DecompositionLevelCount,
lwe_per_glwe: u32,
storage_log_modulus: u32,
num_blocks: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
glwe_array_out.gpu_index(0),
"GPU error: first stream is on GPU {}, first glwe output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
glwe_array_out.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
lwe_array_in.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lwe_array_in.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
fp_keyswitch_key.gpu_index(0),
@@ -746,7 +755,25 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
streams.gpu_indexes[0].get(),
fp_keyswitch_key.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
lwe_array_in.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lwe_array_in.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
glwe_array_out.data.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
glwe_array_out.data.gpu_index(0).get(),
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let array_in_ffi = prepare_cuda_lwe_ct_ffi(lwe_array_in);
let mut glwe_array_out_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_array_out);
scratch_cuda_integer_compress_radix_ciphertext_64(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
@@ -762,7 +789,6 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
carry_modulus.0 as u32,
PBSType::Classical as u32,
lwe_per_glwe,
storage_log_modulus,
true,
);
@@ -770,10 +796,9 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
streams.len() as u32,
glwe_array_out.as_mut_c_ptr(0),
lwe_array_in.as_c_ptr(0),
&raw mut glwe_array_out_ffi,
&raw const array_in_ffi,
fp_keyswitch_key.ptr.as_ptr(),
num_blocks,
mem_ptr,
);
@@ -796,7 +821,6 @@ pub fn get_compression_size_on_gpu(
ks_base_log: DecompositionBaseLog,
ks_level: DecompositionLevelCount,
lwe_per_glwe: u32,
storage_log_modulus: u32,
num_blocks: u32,
) -> u64 {
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
@@ -816,7 +840,6 @@ pub fn get_compression_size_on_gpu(
carry_modulus.0 as u32,
PBSType::Classical as u32,
lwe_per_glwe,
storage_log_modulus,
false,
)
};
@@ -839,10 +862,9 @@ pub fn get_compression_size_on_gpu(
/// is required
pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
glwe_in: &CudaVec<T>,
lwe_array_out: &mut CudaLweCiphertextList<T>,
glwe_in: &CudaPackedGlweCiphertextList,
bootstrapping_key: &CudaVec<B>,
bodies_count: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
encryption_glwe_dimension: GlweDimension,
@@ -852,23 +874,22 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
lwe_dimension: LweDimension,
pbs_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
storage_log_modulus: u32,
vec_indexes: &[u32],
num_lwes: u32,
num_blocks_to_decompress: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
lwe_array_out.gpu_index(0),
lwe_array_out.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lwe_array_out.gpu_index(0).get(),
lwe_array_out.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
glwe_in.gpu_index(0),
glwe_in.data.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
glwe_in.gpu_index(0).get(),
glwe_in.data.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
@@ -878,6 +899,10 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
bootstrapping_key.gpu_index(0).get(),
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let mut lwe_array_out_ffi = prepare_cuda_lwe_ct_ffi(lwe_array_out);
let glwe_array_in_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_in);
scratch_cuda_integer_decompress_radix_ciphertext_64(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
@@ -890,12 +915,10 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
lwe_dimension.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
num_lwes,
num_blocks_to_decompress,
message_modulus.0 as u32,
carry_modulus.0 as u32,
PBSType::Classical as u32,
storage_log_modulus,
bodies_count,
true,
false,
);
@@ -904,10 +927,9 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
streams.len() as u32,
lwe_array_out.as_mut_c_ptr(0),
glwe_in.as_c_ptr(0),
&raw mut lwe_array_out_ffi,
&raw const glwe_array_in_ffi,
vec_indexes.as_ptr(),
vec_indexes.len() as u32,
bootstrapping_key.ptr.as_ptr(),
mem_ptr,
);
@@ -923,7 +945,6 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
#[allow(clippy::too_many_arguments)]
pub fn get_decompression_size_on_gpu(
streams: &CudaStreams,
bodies_count: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
encryption_glwe_dimension: GlweDimension,
@@ -933,8 +954,7 @@ pub fn get_decompression_size_on_gpu(
lwe_dimension: LweDimension,
pbs_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
storage_log_modulus: u32,
num_lwes: u32,
num_blocks_to_decompress: u32,
) -> u64 {
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let size_tracker = unsafe {
@@ -950,12 +970,10 @@ pub fn get_decompression_size_on_gpu(
lwe_dimension.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
num_lwes,
num_blocks_to_decompress,
message_modulus.0 as u32,
carry_modulus.0 as u32,
PBSType::Classical as u32,
storage_log_modulus,
bodies_count,
false,
false,
)