Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-10 07:08:03 -05:00)

Commit: works again
@@ -2,27 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H

#include "../../pbs/pbs_enums.h"

typedef struct {
  void *ptr;
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;

typedef struct {
  void *ptr;
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;
  // Input LWEs are grouped into groups of `lwe_per_glwe` (the last group may
  // be smaller). Each group is then packed into one GLWE with `lwe_per_glwe`
  // bodies (one for each LWE of the group). In the end, the total number of
  // bodies is equal to the number of input LWEs.
  uint32_t total_lwe_bodies_count;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;

#include "../integer.h"
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

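The comment on CudaPackedGlweCiphertextListFFI above describes how input LWEs are grouped before packing. The snippet below is a minimal host-side sketch of the implied bookkeeping, using only fields named in the struct; the helper name and the assertion values are illustrative and not part of the header:

#include <cstdint>
#include <cassert>

// Illustrative only: derives the number of GLWEs needed to pack a batch of
// LWEs, following the grouping rule stated in the comment above.
inline uint32_t count_glwes_for_packing(uint32_t total_lwe_bodies_count,
                                        uint32_t lwe_per_glwe) {
  // Ceiling division: every full group of `lwe_per_glwe` LWEs fills one GLWE,
  // and a smaller trailing group (if any) still needs one GLWE of its own.
  return (total_lwe_bodies_count + lwe_per_glwe - 1) / lwe_per_glwe;
}

int main() {
  // Example: 10 LWE bodies packed 4 per GLWE -> 2 full GLWEs + 1 partial = 3.
  assert(count_glwes_for_packing(10, 4) == 3);
  // The total number of bodies across all GLWEs stays equal to the input count.
  return 0;
}
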
@@ -80,6 +80,26 @@ typedef struct {
  bool const divisor_has_more_bits_than_numerator;
} CudaScalarDivisorFFI;

typedef struct {
  void *ptr;
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;

typedef struct {
  void *ptr;
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;
  // Input LWEs are grouped into groups of `lwe_per_glwe` (the last group may
  // be smaller). Each group is then packed into one GLWE with `lwe_per_glwe`
  // bodies (one for each LWE of the group). In the end, the total number of
  // bodies is equal to the number of input LWEs.
  uint32_t total_lwe_bodies_count;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;

uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,

@@ -107,9 +107,9 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
    }

    print_body<Torus>("remainder", (Torus *)remainder->ptr,
                      remainder->num_radix_blocks, radix_params.big_lwe_dimension,
                      576460752303423488ULL);
    // print_body<Torus>("remainder", (Torus *)remainder->ptr,
    //                   remainder->num_radix_blocks, radix_params.big_lwe_dimension,
    //                   576460752303423488ULL);

    for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
      uint32_t slice_len = num_blocks - block_index;

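The 576460752303423488ULL constant passed to these debug print_body calls appears to be 2^59, which matches the plaintext scaling factor (delta) used with 2_2 radix parameters. The check below only illustrates that arithmetic and is not code from this commit:

#include <cstdint>

int main() {
  // With message_modulus = 4 and carry_modulus = 4 (2_2 parameters), the
  // plaintext is scaled into the top bits of a 64-bit torus element.
  const uint64_t message_modulus = 4;
  const uint64_t carry_modulus = 4;
  const uint64_t delta = (1ULL << 63) / (message_modulus * carry_modulus);
  static_assert((1ULL << 59) == 576460752303423488ULL, "constant is 2^59");
  return delta == 576460752303423488ULL ? 0 : 1;
}
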
@@ -128,16 +128,16 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
                                        slice_len, remainder, block_index,
                                        num_blocks);

    if (slice_len == 4) {
      print_body<Torus>("low1", (Torus *)low1->ptr, low1->num_radix_blocks,
                        radix_params.big_lwe_dimension, 576460752303423488ULL);
      print_body<Torus>("low2", (Torus *)low2->ptr, low2->num_radix_blocks,
                        radix_params.big_lwe_dimension, 576460752303423488ULL);
      print_body<Torus>("low3", (Torus *)low3->ptr, low3->num_radix_blocks,
                        radix_params.big_lwe_dimension, 576460752303423488ULL);
      print_body<Torus>("rem", (Torus *)rem->ptr, rem->num_radix_blocks,
                        radix_params.big_lwe_dimension, 576460752303423488ULL);
    }
    // if (slice_len == 4) {
    //   print_body<Torus>("low1", (Torus *)low1->ptr, low1->num_radix_blocks,
    //                     radix_params.big_lwe_dimension, 576460752303423488ULL);
    //   print_body<Torus>("low2", (Torus *)low2->ptr, low2->num_radix_blocks,
    //                     radix_params.big_lwe_dimension, 576460752303423488ULL);
    //   print_body<Torus>("low3", (Torus *)low3->ptr, low3->num_radix_blocks,
    //                     radix_params.big_lwe_dimension, 576460752303423488ULL);
    //   print_body<Torus>("rem", (Torus *)rem->ptr, rem->num_radix_blocks,
    //                     radix_params.big_lwe_dimension, 576460752303423488ULL);
    // }
    uint32_t compute_borrow = 1;
    uint32_t uses_input_borrow = 0;
    auto sub_result_f = [&](cudaStream_t const *streams,

@@ -223,12 +223,12 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
    auto o2 = mem_ptr->sub_2_overflowed;
    auto o3 = mem_ptr->sub_1_overflowed;

    print_body<Torus>("r1", (Torus *)r1->ptr, r1->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("r2", (Torus *)r2->ptr, r2->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("r3", (Torus *)r3->ptr, r3->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("r1", (Torus *)r1->ptr, r1->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("r2", (Torus *)r2->ptr, r2->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("r3", (Torus *)r3->ptr, r3->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);

    // used as a bitor
    host_integer_radix_bitop_kb(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,

@@ -248,19 +248,19 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

    print_body<Torus>("o1", (Torus *)o1->ptr, 1, radix_params.big_lwe_dimension,
                      576460752303423488ULL);
    print_body<Torus>("o2", (Torus *)o2->ptr, 1, radix_params.big_lwe_dimension,
                      576460752303423488ULL);
    print_body<Torus>("o3", (Torus *)o3->ptr, 1, radix_params.big_lwe_dimension,
                      576460752303423488ULL);

    print_body<Torus>("cmp1", (Torus *)mem_ptr->cmp_1->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("cmp2", (Torus *)mem_ptr->cmp_2->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("cmp3", (Torus *)mem_ptr->cmp_3->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("o1", (Torus *)o1->ptr, 1, radix_params.big_lwe_dimension,
    //                   576460752303423488ULL);
    // print_body<Torus>("o2", (Torus *)o2->ptr, 1, radix_params.big_lwe_dimension,
    //                   576460752303423488ULL);
    // print_body<Torus>("o3", (Torus *)o3->ptr, 1, radix_params.big_lwe_dimension,
    //                   576460752303423488ULL);
    //
    // print_body<Torus>("cmp1", (Torus *)mem_ptr->cmp_1->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("cmp2", (Torus *)mem_ptr->cmp_2->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("cmp3", (Torus *)mem_ptr->cmp_3->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);

    // The cx variables tell whether the corresponding result of the subtraction
    // should be kept, and what value the quotient block should have

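The comment above says that the cx variables select which trial-subtraction result is kept and which value the quotient block gets. Below is a plaintext analogue of that selection, purely illustrative: the radix-4 digit framing and all variable names are assumptions inferred from low1/low2/low3 and the o1/o2/o3 overflow flags in this diff, not code from the repository.

#include <cstdint>
#include <cstdio>

int main() {
  // Plaintext analogue (illustrative): one radix-4 digit step of division.
  // `slice` plays the role of the current remainder slice, `d` the divisor.
  uint64_t slice = 11, d = 3;

  // Three trial subtractions, as with r1/r2/r3 above; ok = "did not borrow".
  bool ok1 = slice >= 1 * d, ok2 = slice >= 2 * d, ok3 = slice >= 3 * d;

  // One-hot selectors c0..c3: exactly one is 1, the others are 0.
  int c3 = ok3;
  int c2 = ok2 && !ok3;
  int c1 = ok1 && !ok2;
  int c0 = !ok1;

  // Keep the matching subtraction result and set the quotient digit as a
  // masked sum (mirrors "multiply by a factor, then add" on ciphertexts).
  uint64_t rem = c0 * slice + c1 * (slice - 1 * d) + c2 * (slice - 2 * d) +
                 c3 * (slice - 3 * d);
  int quotient_digit = 0 * c0 + 1 * c1 + 2 * c2 + 3 * c3;

  printf("digit=%d rem=%llu\n", quotient_digit, (unsigned long long)rem);
  return 0;
}
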
@@ -312,24 +312,24 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                             mem_ptr->c0, 0, 1, o1, 0, 1);

    print_body<Torus>("c0", (Torus *)mem_ptr->c0->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("c1", (Torus *)mem_ptr->c1->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("c2", (Torus *)mem_ptr->c2->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("c3", (Torus *)mem_ptr->c3->ptr, 1,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("c0", (Torus *)mem_ptr->c0->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("c1", (Torus *)mem_ptr->c1->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("c2", (Torus *)mem_ptr->c2->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("c3", (Torus *)mem_ptr->c3->ptr, 1,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);

    auto conditional_update =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count, CudaRadixCiphertextFFI *cx,
            CudaRadixCiphertextFFI *rx, int_radix_lut<Torus> *lut,
            uint32_t factor) {
            Torus factor) {
          // printf("rx->num_radix_blocks: %d\n", rx->num_radix_blocks);
          auto rx_list = to_lwe_ciphertext_list(rx);
          host_cleartext_multiplication<Torus>(
              streams[0], gpu_indexes[0], (Torus *)rx->ptr, (Torus *)rx->ptr, factor,
              radix_params.big_lwe_dimension, rx->num_radix_blocks);
              streams[0], gpu_indexes[0], (Torus *)rx->ptr, &rx_list, factor);
          host_add_the_same_block_to_all_blocks<Torus>(streams[0], gpu_indexes[0], rx,
                                                       rx, cx, 4, 4);

@@ -385,21 +385,21 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
      cuda_synchronize_stream(mem_ptr->sub_streams_7[j], gpu_indexes[j]);
    }

    print_body<Torus>("gpu_after_r1", (Torus *)r1->ptr, r1->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("gpu_after_r2", (Torus *)r2->ptr, r2->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("gpu_after_r3", (Torus *)r3->ptr, r3->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("gpu_after_rem", (Torus *)rem->ptr, rem->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);

    print_body<Torus>("gpu_after_q1", (Torus *)mem_ptr->q1->ptr, mem_ptr->q1->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("gpu_after_q2", (Torus *)mem_ptr->q2->ptr, mem_ptr->q2->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    print_body<Torus>("gpu_after_q3", (Torus *)mem_ptr->q3->ptr, mem_ptr->q3->num_radix_blocks,
                      radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("gpu_after_r1", (Torus *)r1->ptr, r1->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("gpu_after_r2", (Torus *)r2->ptr, r2->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("gpu_after_r3", (Torus *)r3->ptr, r3->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("gpu_after_rem", (Torus *)rem->ptr, rem->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    //
    // print_body<Torus>("gpu_after_q1", (Torus *)mem_ptr->q1->ptr, mem_ptr->q1->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("gpu_after_q2", (Torus *)mem_ptr->q2->ptr, mem_ptr->q2->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);
    // print_body<Torus>("gpu_after_q3", (Torus *)mem_ptr->q3->ptr, mem_ptr->q3->num_radix_blocks,
    //                   radix_params.big_lwe_dimension, 576460752303423488ULL);

    host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
                         r3, rem->num_radix_blocks, 4, 4);

@@ -431,7 +431,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
    rem->num_radix_blocks = remainder->num_radix_blocks;
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0],
        remainder, block_index, rem->num_radix_blocks,
        remainder, block_index, remainder->num_radix_blocks,
        rem, 0, tmp_rem_size);
    rem->num_radix_blocks = tmp_rem_size;

@@ -7,6 +7,13 @@
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"

inline CudaLweCiphertextListFFI to_lwe_ciphertext_list(CudaRadixCiphertextFFI* radix) {
  return {
      .ptr = radix->ptr,
      .num_radix_blocks = radix->num_radix_blocks,
      .lwe_dimension = radix->lwe_dimension
  };
}
template <typename Torus>
void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                        uint32_t const gpu_index,

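The new to_lwe_ciphertext_list helper above builds a CudaLweCiphertextListFFI that aliases the radix ciphertext's device pointer rather than copying it, which is why the conditional_update call site earlier in this diff can pass &rx_list while still writing through rx->ptr. Below is a self-contained sketch of that view semantics, using simplified stand-in structs that only reproduce the fields the helper touches (the real definitions live in the headers shown earlier):

#include <cstdint>
#include <cassert>

// Simplified stand-ins for the FFI structs from the diff above.
struct CudaRadixCiphertextFFI {
  void *ptr;
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
};
struct CudaLweCiphertextListFFI {
  void *ptr;
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
};

// Same shape as the helper in the diff: a non-owning view over the same buffer.
inline CudaLweCiphertextListFFI to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
  return {radix->ptr, radix->num_radix_blocks, radix->lwe_dimension};
}

int main() {
  uint64_t body_storage[8] = {};
  CudaRadixCiphertextFFI rx{body_storage, 2, 3};
  CudaLweCiphertextListFFI view = to_lwe_ciphertext_list(&rx);
  // The view shares the pointer: writes through rx remain visible via the view.
  assert(view.ptr == rx.ptr && view.num_radix_blocks == 2);
  return 0;
}
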
@@ -105,134 +105,6 @@ const _: () = {
        ms_input_variance
    ) - 32usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
    pub ptr: *mut ffi::c_void,
    pub num_radix_blocks: u32,
    pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
    ["Size of CudaLweCiphertextListFFI"]
        [::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
    ["Alignment of CudaLweCiphertextListFFI"]
        [::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
    ["Offset of field: CudaLweCiphertextListFFI::ptr"]
        [::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
    ["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
        [::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
    ["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
        [::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
    pub ptr: *mut ffi::c_void,
    pub storage_log_modulus: u32,
    pub lwe_per_glwe: u32,
    pub total_lwe_bodies_count: u32,
    pub glwe_dimension: u32,
    pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
    ["Size of CudaPackedGlweCiphertextListFFI"]
        [::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
    ["Alignment of CudaPackedGlweCiphertextListFFI"]
        [::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
    pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        compression_glwe_dimension: u32,
        compression_polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        num_radix_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        lwe_per_glwe: u32,
        allocate_gpu_memory: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        encryption_glwe_dimension: u32,
        encryption_polynomial_size: u32,
        compression_glwe_dimension: u32,
        compression_polynomial_size: u32,
        lwe_dimension: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        num_blocks_to_decompress: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_compress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
        lwe_array_in: *const CudaLweCiphertextListFFI,
        fp_ksk: *const *mut ffi::c_void,
        mem_ptr: *mut i8,
    );
}
unsafe extern "C" {
    pub fn cuda_integer_decompress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        lwe_array_out: *mut CudaLweCiphertextListFFI,
        glwe_in: *const CudaPackedGlweCiphertextListFFI,
        indexes_array: *const u32,
        bsks: *const *mut ffi::c_void,
        mem_ptr: *mut i8,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;

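The const _ blocks in the bindings above are bindgen-generated layout assertions: each indexing expression only compiles if the struct's size, alignment, or field offset matches the C layout. Below is a C++ analogue of the same idea using static_assert and offsetof; it is purely illustrative and assumes a 64-bit target, the same assumption the generated assertions encode.

#include <cstddef>
#include <cstdint>

// Mirror of the repr(C) struct declared in the bindings above.
struct CudaLweCiphertextListFFI {
  void *ptr;
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
};

// Compile-time layout checks, the C++ counterpart of bindgen's
// ["Size of ..."][size_of::<T>() - 16usize] pattern: if the layout drifts,
// the build fails instead of silently corrupting FFI calls.
static_assert(sizeof(CudaLweCiphertextListFFI) == 16, "size");
static_assert(alignof(CudaLweCiphertextListFFI) == 8, "alignment");
static_assert(offsetof(CudaLweCiphertextListFFI, ptr) == 0, "ptr offset");
static_assert(offsetof(CudaLweCiphertextListFFI, num_radix_blocks) == 8, "offset");
static_assert(offsetof(CudaLweCiphertextListFFI, lwe_dimension) == 12, "offset");

int main() { return 0; }
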
@@ -367,6 +239,55 @@ const _: () = {
        divisor_has_more_bits_than_numerator
    ) - 60usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
    pub ptr: *mut ffi::c_void,
    pub num_radix_blocks: u32,
    pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
    ["Size of CudaLweCiphertextListFFI"]
        [::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
    ["Alignment of CudaLweCiphertextListFFI"]
        [::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
    ["Offset of field: CudaLweCiphertextListFFI::ptr"]
        [::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
    ["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
        [::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
    ["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
        [::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
    pub ptr: *mut ffi::c_void,
    pub storage_log_modulus: u32,
    pub lwe_per_glwe: u32,
    pub total_lwe_bodies_count: u32,
    pub glwe_dimension: u32,
    pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
    ["Size of CudaPackedGlweCiphertextListFFI"]
        [::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
    ["Alignment of CudaPackedGlweCiphertextListFFI"]
        [::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
    ["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
    pub fn scratch_cuda_apply_univariate_lut_kb_64(
        streams: *const *mut ffi::c_void,

@@ -1934,6 +1855,85 @@ unsafe extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        compression_glwe_dimension: u32,
        compression_polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        num_radix_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        lwe_per_glwe: u32,
        allocate_gpu_memory: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        encryption_glwe_dimension: u32,
        encryption_polynomial_size: u32,
        compression_glwe_dimension: u32,
        compression_polynomial_size: u32,
        lwe_dimension: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        num_blocks_to_decompress: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_compress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
        lwe_array_in: *const CudaLweCiphertextListFFI,
        fp_ksk: *const *mut ffi::c_void,
        mem_ptr: *mut i8,
    );
}
unsafe extern "C" {
    pub fn cuda_integer_decompress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        lwe_array_out: *mut CudaLweCiphertextListFFI,
        glwe_in: *const CudaPackedGlweCiphertextListFFI,
        indexes_array: *const u32,
        bsks: *const *mut ffi::c_void,
        mem_ptr: *mut i8,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;