feat(gpu): aes 128

Enzo Di Maria
2025-08-26 16:07:54 +02:00
committed by enzodimaria
parent 0604d237eb
commit f0f3dd76eb
17 changed files with 3192 additions and 0 deletions

View File

@@ -84,6 +84,7 @@ fn main() {
"cuda/include/ciphertext.h",
"cuda/include/integer/compression/compression.h",
"cuda/include/integer/integer.h",
"cuda/include/aes/aes.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",

View File

@@ -0,0 +1,44 @@
#ifndef AES_H
#define AES_H
#include "../integer/integer.h"
extern "C" {
uint64_t scratch_cuda_integer_aes_encrypt_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
uint32_t sbox_parallelism);
void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *iv,
CudaRadixCiphertextFFI const *round_keys,
const uint64_t *counter_bits_le_all_blocks,
uint32_t num_aes_inputs, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_key_expansion_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks);
void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
}
#endif

View File

@@ -0,0 +1,440 @@
#ifndef AES_UTILITIES
#define AES_UTILITIES
#include "../integer/integer_utilities.h"
/**
* This structure holds pre-computed LUTs for essential bitwise operations
* required by the homomorphic AES circuit. Pre-computing these tables allows
* for efficient application of non-linear functions like AND during the PBS
* process. It includes LUTs for:
* - AND: for the non-linear part of the S-Box.
* - FLUSH: to clear carry bits and isolate the message bit (x -> x & 1).
* - CARRY: to extract the carry bit for additions (x -> (x >> 1) & 1).
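 *
 * For example, with message_modulus = 2 and carry_modulus = 2, a block that
 * holds x = 3 (0b11) after a bit addition yields FLUSH(3) = 1 (the message
 * bit) and CARRY(3) = 1 (the carry to propagate).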
*/
template <typename Torus> struct int_aes_lut_buffers {
int_radix_lut<Torus> *and_lut;
int_radix_lut<Torus> *flush_lut;
int_radix_lut<Torus> *carry_lut;
int_aes_lut_buffers(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
uint32_t sbox_parallelism, uint64_t &size_tracker) {
constexpr uint32_t AES_STATE_BITS = 128;
constexpr uint32_t SBOX_MAX_AND_GATES = 18;
this->and_lut = new int_radix_lut<Torus>(
streams, params, 1,
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return a & b; };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->flush_lut = new int_radix_lut<Torus>(
streams, params, 1, AES_STATE_BITS * num_aes_inputs,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush_lut =
streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->carry_lut = new int_radix_lut<Torus>(
streams, params, 1, num_aes_inputs, allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
return (x >> 1) & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
}
void release(CudaStreams streams) {
this->and_lut->release(streams);
delete this->and_lut;
this->and_lut = nullptr;
this->flush_lut->release(streams);
delete this->flush_lut;
this->flush_lut = nullptr;
this->carry_lut->release(streams);
delete this->carry_lut;
this->carry_lut = nullptr;
}
};
/**
* The operations within an AES round, particularly MixColumns, require
* intermediate storage for calculations. These buffers are designed to hold
* temporary values like copies of columns or the results of multiplications,
* avoiding overwriting data that is still needed in the same round.
*/
template <typename Torus> struct int_aes_round_workspaces {
CudaRadixCiphertextFFI *mix_columns_col_copy_buffer;
CudaRadixCiphertextFFI *mix_columns_mul_workspace_buffer;
CudaRadixCiphertextFFI *vec_tmp_bit_buffer;
int_aes_round_workspaces(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
uint64_t &size_tracker) {
constexpr uint32_t BITS_PER_BYTE = 8;
constexpr uint32_t BYTES_PER_COLUMN = 4;
constexpr uint32_t BITS_PER_COLUMN = BITS_PER_BYTE * BYTES_PER_COLUMN;
constexpr uint32_t MIX_COLUMNS_MUL_WORKSPACE_BYTES = BYTES_PER_COLUMN + 1;
this->mix_columns_col_copy_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0),
this->mix_columns_col_copy_buffer, BITS_PER_COLUMN * num_aes_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->mix_columns_mul_workspace_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0),
this->mix_columns_mul_workspace_buffer,
MIX_COLUMNS_MUL_WORKSPACE_BYTES * BITS_PER_BYTE * num_aes_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->vec_tmp_bit_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->vec_tmp_bit_buffer,
num_aes_inputs, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
}
void release(CudaStreams streams, bool allocate_gpu_memory) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->mix_columns_col_copy_buffer,
allocate_gpu_memory);
delete this->mix_columns_col_copy_buffer;
this->mix_columns_col_copy_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->mix_columns_mul_workspace_buffer,
allocate_gpu_memory);
delete this->mix_columns_mul_workspace_buffer;
this->mix_columns_mul_workspace_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->vec_tmp_bit_buffer,
allocate_gpu_memory);
delete this->vec_tmp_bit_buffer;
this->vec_tmp_bit_buffer = nullptr;
}
};
/**
* In CTR mode, a counter is homomorphically added to the encrypted IV. This
* structure holds the necessary buffers for this 128-bit ripple-carry
* addition, such as the buffer for the propagating carry bit
* (`vec_tmp_carry_buffer`) across the addition chain.
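 *
 * Roughly, bit position i of the chain computes sum_i = iv_i + counter_i +
 * carry_i, then out_i = FLUSH(sum_i) = sum_i & 1 and carry_{i+1} =
 * CARRY(sum_i) = (sum_i >> 1) & 1, where counter_i is a trivial encryption
 * staged in `vec_trivial_b_bits_buffer`.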
*/
template <typename Torus> struct int_aes_counter_workspaces {
CudaRadixCiphertextFFI *vec_tmp_carry_buffer;
CudaRadixCiphertextFFI *vec_tmp_sum_buffer;
CudaRadixCiphertextFFI *vec_trivial_b_bits_buffer;
Torus *h_counter_bits_buffer;
Torus *d_counter_bits_buffer;
int_aes_counter_workspaces(CudaStreams streams,
const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
uint64_t &size_tracker) {
this->vec_tmp_carry_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->vec_tmp_carry_buffer,
num_aes_inputs, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->vec_tmp_sum_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->vec_tmp_sum_buffer,
num_aes_inputs, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->vec_trivial_b_bits_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0),
this->vec_trivial_b_bits_buffer, num_aes_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->h_counter_bits_buffer =
(Torus *)malloc(num_aes_inputs * sizeof(Torus));
size_tracker += num_aes_inputs * sizeof(Torus);
this->d_counter_bits_buffer = (Torus *)cuda_malloc_with_size_tracking_async(
num_aes_inputs * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
}
void release(CudaStreams streams, bool allocate_gpu_memory) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->vec_tmp_carry_buffer,
allocate_gpu_memory);
delete this->vec_tmp_carry_buffer;
this->vec_tmp_carry_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->vec_tmp_sum_buffer,
allocate_gpu_memory);
delete this->vec_tmp_sum_buffer;
this->vec_tmp_sum_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->vec_trivial_b_bits_buffer,
allocate_gpu_memory);
delete this->vec_trivial_b_bits_buffer;
this->vec_trivial_b_bits_buffer = nullptr;
free(this->h_counter_bits_buffer);
if (allocate_gpu_memory) {
cuda_drop_async(this->d_counter_bits_buffer, streams.stream(0),
streams.gpu_index(0));
streams.synchronize();
}
}
};
/**
 * This structure allocates the largest memory blocks:
* - `sbox_internal_workspace`: A large workspace for the complex, parallel
* evaluation of the S-Box circuit.
* - `main_bitsliced_states_buffer`: Holds the entire set of AES states in a
* bitsliced layout, which is optimal for parallel bitwise operations on the
* GPU.
* - Other buffers are used for data layout transformations (transposition) and
* for batching small operations into larger, more efficient launches.
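 *
 * The bitsliced buffers are sized num_aes_inputs * AES_STATE_BITS so that the
 * same bit position of every AES state can be processed by one batched
 * LUT/PBS launch instead of one launch per input.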
*/
template <typename Torus> struct int_aes_main_workspaces {
CudaRadixCiphertextFFI *sbox_internal_workspace;
CudaRadixCiphertextFFI *initial_states_and_jit_key_workspace;
CudaRadixCiphertextFFI *main_bitsliced_states_buffer;
CudaRadixCiphertextFFI *tmp_tiled_key_buffer;
CudaRadixCiphertextFFI *batch_processing_buffer;
int_aes_main_workspaces(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
uint32_t sbox_parallelism, uint64_t &size_tracker) {
constexpr uint32_t AES_STATE_BITS = 128;
constexpr uint32_t SBOX_MAX_AND_GATES = 18;
constexpr uint32_t BATCH_BUFFER_OPERANDS = 3;
this->sbox_internal_workspace = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->sbox_internal_workspace,
num_aes_inputs * AES_STATE_BITS * sbox_parallelism,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->initial_states_and_jit_key_workspace = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0),
this->initial_states_and_jit_key_workspace,
num_aes_inputs * AES_STATE_BITS, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->main_bitsliced_states_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0),
this->main_bitsliced_states_buffer, num_aes_inputs * AES_STATE_BITS,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->tmp_tiled_key_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->tmp_tiled_key_buffer,
num_aes_inputs * AES_STATE_BITS, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->batch_processing_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->batch_processing_buffer,
num_aes_inputs * SBOX_MAX_AND_GATES * BATCH_BUFFER_OPERANDS *
sbox_parallelism,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(CudaStreams streams, bool allocate_gpu_memory) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->sbox_internal_workspace,
allocate_gpu_memory);
delete this->sbox_internal_workspace;
this->sbox_internal_workspace = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->initial_states_and_jit_key_workspace,
allocate_gpu_memory);
delete this->initial_states_and_jit_key_workspace;
this->initial_states_and_jit_key_workspace = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->main_bitsliced_states_buffer,
allocate_gpu_memory);
delete this->main_bitsliced_states_buffer;
this->main_bitsliced_states_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->tmp_tiled_key_buffer,
allocate_gpu_memory);
delete this->tmp_tiled_key_buffer;
this->tmp_tiled_key_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->batch_processing_buffer,
allocate_gpu_memory);
delete this->batch_processing_buffer;
this->batch_processing_buffer = nullptr;
}
};
/**
 * This structure acts as a container, holding instances of all the other
 * buffer management structs. It provides a single object to manage the
 * entire lifecycle of memory needed for a complete AES-CTR encryption
 * operation.
*/
template <typename Torus> struct int_aes_encrypt_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_aes_inputs;
uint32_t sbox_parallel_instances;
int_aes_lut_buffers<Torus> *luts;
int_aes_round_workspaces<Torus> *round_workspaces;
int_aes_counter_workspaces<Torus> *counter_workspaces;
int_aes_main_workspaces<Torus> *main_workspaces;
int_aes_encrypt_buffer(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
uint32_t sbox_parallelism, uint64_t &size_tracker) {
PANIC_IF_FALSE(num_aes_inputs >= 1,
"num_aes_inputs must be greater than or equal to 1");
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_aes_inputs = num_aes_inputs;
this->sbox_parallel_instances = sbox_parallelism;
this->luts = new int_aes_lut_buffers<Torus>(
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
size_tracker);
this->round_workspaces = new int_aes_round_workspaces<Torus>(
streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
this->counter_workspaces = new int_aes_counter_workspaces<Torus>(
streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
this->main_workspaces = new int_aes_main_workspaces<Torus>(
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
size_tracker);
}
void release(CudaStreams streams) {
luts->release(streams);
delete luts;
luts = nullptr;
round_workspaces->release(streams, allocate_gpu_memory);
delete round_workspaces;
round_workspaces = nullptr;
counter_workspaces->release(streams, allocate_gpu_memory);
delete counter_workspaces;
counter_workspaces = nullptr;
main_workspaces->release(streams, allocate_gpu_memory);
delete main_workspaces;
main_workspaces = nullptr;
}
};
/**
* This structure holds the buffer for the 44 words of the expanded key
* and temporary storage for word manipulations.
* It contains its own instance of `int_aes_encrypt_buffer` because the
* key expansion algorithm itself requires using the S-Box.
* This separation ensures that memory for key expansion can be allocated and
* freed independently of the main encryption process.
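 *
 * For AES-128 the schedule is 44 words of 32 bits: 11 round keys x 4 words
 * each (Nk = 4, Nr = 10), hence TOTAL_BITS = 44 * 32 = 1408 one-bit radix
 * blocks below.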
*/
template <typename Torus> struct int_key_expansion_buffer {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *words_buffer;
CudaRadixCiphertextFFI *tmp_word_buffer;
CudaRadixCiphertextFFI *tmp_rotated_word_buffer;
int_aes_encrypt_buffer<Torus> *aes_encrypt_buffer;
int_key_expansion_buffer(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
constexpr uint32_t TOTAL_WORDS = 44;
constexpr uint32_t BITS_PER_WORD = 32;
constexpr uint32_t TOTAL_BITS = TOTAL_WORDS * BITS_PER_WORD;
this->words_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->words_buffer, TOTAL_BITS,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->tmp_word_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->tmp_word_buffer,
BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->tmp_rotated_word_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->tmp_rotated_word_buffer,
BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus>(
streams, params, allocate_gpu_memory, 1, 4, size_tracker);
}
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->words_buffer, allocate_gpu_memory);
delete this->words_buffer;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->tmp_word_buffer, allocate_gpu_memory);
delete this->tmp_word_buffer;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->tmp_rotated_word_buffer,
allocate_gpu_memory);
delete this->tmp_rotated_word_buffer;
this->aes_encrypt_buffer->release(streams);
delete this->aes_encrypt_buffer;
}
};
#endif

View File

@@ -0,0 +1,88 @@
#include "../../include/aes/aes.h"
#include "aes.cuh"
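// These entry points follow the usual scratch -> run -> cleanup lifecycle:
// the scratch call computes (and, if allocate_gpu_memory is set, allocates)
// the workspace behind `mem_ptr` and returns its size, the run call consumes
// it, and the cleanup call releases it.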
uint64_t scratch_cuda_integer_aes_encrypt_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
uint32_t sbox_parallelism) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_aes_encrypt<uint64_t>(
CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism);
}
void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *iv,
CudaRadixCiphertextFFI const *round_keys,
const uint64_t *counter_bits_le_all_blocks,
uint32_t num_aes_inputs, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
host_integer_aes_ctr_encrypt<uint64_t>(
CudaStreams(streams), output, iv, round_keys, counter_bits_le_all_blocks,
num_aes_inputs, (int_aes_encrypt_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
}
void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_aes_encrypt_buffer<uint64_t> *mem_ptr =
(int_aes_encrypt_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_key_expansion_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_key_expansion<uint64_t>(
CudaStreams(streams), (int_key_expansion_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory);
}
void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
host_integer_key_expansion<uint64_t>(
CudaStreams(streams), expanded_keys, key,
(int_key_expansion_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
}
void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_key_expansion_buffer<uint64_t> *mem_ptr =
(int_key_expansion_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

File diff suppressed because it is too large

View File

@@ -1729,6 +1729,78 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_aes_encrypt_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
num_aes_inputs: u32,
sbox_parallelism: u32,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_aes_ctr_encrypt_64(
streams: CudaStreamsFFI,
output: *mut CudaRadixCiphertextFFI,
iv: *const CudaRadixCiphertextFFI,
round_keys: *const CudaRadixCiphertextFFI,
counter_bits_le_all_blocks: *const u64,
num_aes_inputs: u32,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_aes_encrypt_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_key_expansion_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_key_expansion_64(
streams: CudaStreamsFFI,
expanded_keys: *mut CudaRadixCiphertextFFI,
key: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_key_expansion_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;

View File

@@ -2,6 +2,7 @@
#include "cuda/include/ciphertext.h"
#include "cuda/include/integer/compression/compression.h"
#include "cuda/include/integer/integer.h"
#include "cuda/include/aes/aes.h"
#include "cuda/include/zk/zk.h"
#include "cuda/include/keyswitch/keyswitch.h"
#include "cuda/include/keyswitch/ks_enums.h"

View File

@@ -0,0 +1,154 @@
#[cfg(feature = "gpu")]
pub mod cuda {
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::utilities::{get_bench_type, write_to_json, BenchmarkType, OperatorType};
use criterion::{black_box, Criterion, Throughput};
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::{IntegerKeyKind, RadixClientKey};
use tfhe::keycache::NamedParam;
use tfhe::shortint::AtomicPatternParameters;
pub fn cuda_aes(c: &mut Criterion) {
let bench_name = "integer::cuda::aes";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60))
.warm_up_time(std::time::Duration::from_secs(60));
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let atomic_param: AtomicPatternParameters = param.into();
let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
let aes_op_bit_size = 128;
let param_name = param.name();
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let (cpu_cks, _) = KEY_CACHE.get_from_params(atomic_param, IntegerKeyKind::Radix);
let sks = CudaServerKey::new(&cpu_cks, &streams);
let cks = RadixClientKey::from((cpu_cks, 1));
let ct_key = cks.encrypt_u128_for_aes_ctr(key);
let ct_iv = cks.encrypt_u128_for_aes_ctr(iv);
let d_key = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_key, &streams);
let d_iv = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_iv, &streams);
{
const NUM_AES_INPUTS: usize = 1;
const SBOX_PARALLELISM: usize = 16;
let bench_id = format!("{param_name}::{NUM_AES_INPUTS}_input_encryption");
let round_keys = unsafe { sks.key_expansion_async(&d_key, &streams) };
streams.synchronize();
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
unsafe {
black_box(sks.aes_encrypt_async(
&d_iv,
&round_keys,
0,
NUM_AES_INPUTS,
SBOX_PARALLELISM,
&streams,
));
}
streams.synchronize();
})
});
write_to_json::<u64, _>(
&bench_id,
atomic_param,
param.name(),
"aes_encryption",
&OperatorType::Atomic,
aes_op_bit_size,
vec![atomic_param.message_modulus().0.ilog2(); aes_op_bit_size as usize],
);
}
{
let bench_id = format!("{param_name}::key_expansion");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
unsafe {
black_box(sks.key_expansion_async(&d_key, &streams));
}
streams.synchronize();
})
});
write_to_json::<u64, _>(
&bench_id,
atomic_param,
param.name(),
"aes_key_expansion",
&OperatorType::Atomic,
aes_op_bit_size,
vec![atomic_param.message_modulus().0.ilog2(); aes_op_bit_size as usize],
);
}
}
BenchmarkType::Throughput => {
const NUM_AES_INPUTS: usize = 192;
const SBOX_PARALLELISM: usize = 16;
let bench_id = format!("throughput::{param_name}::{NUM_AES_INPUTS}_inputs");
let streams = CudaStreams::new_multi_gpu();
let (cpu_cks, _) = KEY_CACHE.get_from_params(atomic_param, IntegerKeyKind::Radix);
let sks = CudaServerKey::new(&cpu_cks, &streams);
let cks = RadixClientKey::from((cpu_cks, 1));
bench_group.throughput(Throughput::Elements(NUM_AES_INPUTS as u64));
let ct_key = cks.encrypt_u128_for_aes_ctr(key);
let ct_iv = cks.encrypt_u128_for_aes_ctr(iv);
let d_key = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_key, &streams);
let d_iv = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_iv, &streams);
let round_keys = unsafe { sks.key_expansion_async(&d_key, &streams) };
streams.synchronize();
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
unsafe {
black_box(sks.aes_encrypt_async(
&d_iv,
&round_keys,
0,
NUM_AES_INPUTS,
SBOX_PARALLELISM,
&streams,
));
}
streams.synchronize();
})
});
write_to_json::<u64, _>(
&bench_id,
atomic_param,
param.name(),
"aes_encryption",
&OperatorType::Atomic,
aes_op_bit_size,
vec![atomic_param.message_modulus().0.ilog2(); aes_op_bit_size as usize],
);
}
};
bench_group.finish();
}
}

View File

@@ -1,5 +1,6 @@
#![allow(dead_code)]
mod aes;
mod oprf;
use benchmark::params::ParamsAndNumBlocksIter;
@@ -2795,6 +2796,7 @@ mod cuda {
cuda_trailing_ones,
cuda_ilog2,
oprf::cuda::cuda_unsigned_oprf,
aes::cuda::cuda_aes,
);
criterion_group!(
@@ -2823,6 +2825,7 @@ mod cuda {
cuda_scalar_div,
cuda_scalar_rem,
oprf::cuda::cuda_unsigned_oprf,
aes::cuda::cuda_aes,
);
criterion_group!(

View File

@@ -7403,3 +7403,289 @@ pub unsafe fn expand_async<T: UnsignedInteger, B: Numeric>(
);
cleanup_expand_without_verification_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
output: &mut CudaRadixCiphertext,
iv: &CudaRadixCiphertext,
round_keys: &CudaRadixCiphertext,
start_counter: u128,
num_aes_inputs: u32,
sbox_parallelism: u32,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
grouping_factor: LweBskGroupingFactor,
pbs_type: PBSType,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
let mut output_degrees = output.info.blocks.iter().map(|b| b.degree.0).collect();
let mut output_noise_levels = output.info.blocks.iter().map(|b| b.noise_level.0).collect();
let mut cuda_ffi_output =
prepare_cuda_radix_ffi(output, &mut output_degrees, &mut output_noise_levels);
let mut iv_degrees = iv.info.blocks.iter().map(|b| b.degree.0).collect();
let mut iv_noise_levels = iv.info.blocks.iter().map(|b| b.noise_level.0).collect();
let cuda_ffi_iv = prepare_cuda_radix_ffi(iv, &mut iv_degrees, &mut iv_noise_levels);
let mut round_keys_degrees = round_keys.info.blocks.iter().map(|b| b.degree.0).collect();
let mut round_keys_noise_levels = round_keys
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let cuda_ffi_round_keys = prepare_cuda_radix_ffi(
round_keys,
&mut round_keys_degrees,
&mut round_keys_noise_levels,
);
let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
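// Flatten all counters into one little-endian bit vector: for each of the
// `num_aes_inputs` blocks, the 128 bits of (start_counter + i),
// least-significant bit first.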
let counter_bits_le: Vec<u64> = (0..num_aes_inputs)
.flat_map(|i| {
let current_counter = start_counter + i as u128;
(0..128).map(move |bit_index| ((current_counter >> bit_index) & 1) as u64)
})
.collect();
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
scratch_cuda_integer_aes_encrypt_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
noise_reduction_type as u32,
num_aes_inputs,
sbox_parallelism,
);
cuda_integer_aes_ctr_encrypt_64(
streams.ffi(),
&raw mut cuda_ffi_output,
&raw const cuda_ffi_iv,
&raw const cuda_ffi_round_keys,
counter_bits_le.as_ptr(),
num_aes_inputs,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
);
cleanup_cuda_integer_aes_encrypt_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
update_noise_degree(output, &cuda_ffi_output);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn get_aes_ctr_encrypt_integer_radix_size_on_gpu(
streams: &CudaStreams,
num_aes_inputs: u32,
sbox_parallelism: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
grouping_factor: LweBskGroupingFactor,
pbs_type: PBSType,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) -> u64 {
let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let size = unsafe {
scratch_cuda_integer_aes_encrypt_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
false,
noise_reduction_type as u32,
num_aes_inputs,
sbox_parallelism,
)
};
unsafe { cleanup_cuda_integer_aes_encrypt_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)) };
size
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn unchecked_key_expansion_integer_radix_kb_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
expanded_keys: &mut CudaRadixCiphertext,
key: &CudaRadixCiphertext,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
grouping_factor: LweBskGroupingFactor,
pbs_type: PBSType,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
let mut expanded_keys_degrees = expanded_keys
.info
.blocks
.iter()
.map(|b| b.degree.0)
.collect();
let mut expanded_keys_noise_levels = expanded_keys
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_expanded_keys = prepare_cuda_radix_ffi(
expanded_keys,
&mut expanded_keys_degrees,
&mut expanded_keys_noise_levels,
);
let mut key_degrees = key.info.blocks.iter().map(|b| b.degree.0).collect();
let mut key_noise_levels = key.info.blocks.iter().map(|b| b.noise_level.0).collect();
let cuda_ffi_key = prepare_cuda_radix_ffi(key, &mut key_degrees, &mut key_noise_levels);
let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
scratch_cuda_integer_key_expansion_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
noise_reduction_type as u32,
);
cuda_integer_key_expansion_64(
streams.ffi(),
&raw mut cuda_ffi_expanded_keys,
&raw const cuda_ffi_key,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
);
cleanup_cuda_integer_key_expansion_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
update_noise_degree(expanded_keys, &cuda_ffi_expanded_keys);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn get_key_expansion_integer_radix_size_on_gpu(
streams: &CudaStreams,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
grouping_factor: LweBskGroupingFactor,
pbs_type: PBSType,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) -> u64 {
let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let size = unsafe {
scratch_cuda_integer_key_expansion_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
false,
noise_reduction_type as u32,
)
};
unsafe {
cleanup_cuda_integer_key_expansion_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr))
};
size
}

View File

@@ -0,0 +1,465 @@
use crate::core_crypto::gpu::{
check_valid_cuda_malloc, check_valid_cuda_malloc_assert_oom, CudaStreams,
};
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::{
get_aes_ctr_encrypt_integer_radix_size_on_gpu, get_key_expansion_integer_radix_size_on_gpu,
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async,
unchecked_key_expansion_integer_radix_kb_assign_async, PBSType,
};
use crate::integer::{RadixCiphertext, RadixClientKey};
use crate::shortint::Ciphertext;
const NUM_BITS: usize = 128;
impl RadixClientKey {
/// Encrypts a 128-bit block for homomorphic AES evaluation.
///
/// This function prepares a 128-bit plaintext block (like an AES key or IV)
/// for homomorphic processing by decomposing it into its 128 constituent bits
/// and encrypting each bit individually with FHE.
///
/// The process is as follows:
/// ```text
/// // INPUT: A 128-bit plaintext block
/// Plaintext block (u128): 0x2b7e1516...
/// |
/// V
/// // 1. Decompose the block into individual bits
/// Individual bits: [b127, b126, ..., b1, b0]
/// |
/// V
/// // 2. Encrypt each bit individually using FHE
/// `self.encrypt(bit)` is applied to each bit
/// |
/// V
/// // 3. Collect the resulting bit-ciphertexts
/// Ciphertexts: [Ct(b127), Ct(b126), ..., Ct(b0)]
/// |
/// V
/// // 4. Group the bit-ciphertexts into a single RadixCiphertext
/// // representing the full encrypted block.
/// // OUTPUT: A RadixCiphertext
/// ```
pub fn encrypt_u128_for_aes_ctr(&self, data: u128) -> RadixCiphertext {
let mut blocks: Vec<Ciphertext> = Vec::with_capacity(NUM_BITS);
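// Bits are encrypted most-significant first; `decrypt_u128_from_aes_ctr`
// reassembles them in the same MSB-first order.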
for i in 0..NUM_BITS {
let bit = ((data >> (NUM_BITS - 1 - i)) & 1) as u64;
blocks.extend(self.encrypt(bit).blocks);
}
RadixCiphertext::from(blocks)
}
/// Decrypts a `RadixCiphertext` containing one or more 128-bit blocks
/// that were homomorphically processed.
///
/// This function reverses the encryption process by decrypting each individual
/// bit-ciphertext and reassembling them into 128-bit plaintext blocks.
///
/// The process is as follows:
/// ```text
/// // INPUT: RadixCiphertext containing one or more encrypted blocks
/// Ciphertext collection: [Ct(b127), ..., Ct(b0), Ct(b'127), ..., Ct(b'0), ...]
/// |
/// | (For each sequence of 128 bit-ciphertexts)
/// V
/// // 1. Decrypt each bit's ciphertext individually
/// `self.decrypt(Ct)` is applied to each bit-ciphertext
/// |
/// V
/// // 2. Collect the resulting plaintext bits
/// Plaintext bits: [b127, b126, ..., b0]
/// |
/// V
/// // 3. Assemble the bits back into a 128-bit block
/// Reconstruction: ( ...((b127 << 1) | b126) << 1 | ... ) | b0
/// |
/// V
/// // OUTPUT: A vector of plaintext u128 blocks
/// Plaintext u128s: [0x..., ...]
/// ```
pub fn decrypt_u128_from_aes_ctr(
&self,
encrypted_result: &RadixCiphertext,
num_aes_inputs: usize,
) -> Vec<u128> {
let mut plaintext_results = Vec::with_capacity(num_aes_inputs);
for i in 0..num_aes_inputs {
let mut current_block_plaintext: u128 = 0;
let block_start_index = i * NUM_BITS;
for j in 0..NUM_BITS {
let block_slice =
&encrypted_result.blocks[block_start_index + j..block_start_index + j + 1];
let block_radix_ct = RadixCiphertext::from(block_slice.to_vec());
let decrypted_bit: u128 = self.decrypt(&block_radix_ct);
current_block_plaintext = (current_block_plaintext << 1) | decrypted_bit;
}
plaintext_results.push(current_block_plaintext);
}
plaintext_results
}
}
impl CudaServerKey {
pub fn aes_ctr(
&self,
key: &CudaUnsignedRadixCiphertext,
iv: &CudaUnsignedRadixCiphertext,
start_counter: u128,
num_aes_inputs: usize,
streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
let gpu_index = streams.gpu_indexes[0];
let key_expansion_size = self.get_key_expansion_size_on_gpu(streams);
check_valid_cuda_malloc_assert_oom(key_expansion_size, gpu_index);
// `parallelism` is the degree of S-box parallelization. The S-box processes
// the 16 bytes of a state either sequentially or in groups of 2, 4, or 8,
// or all 16 at once. Higher parallelism costs more GPU memory, so we start
// from the maximum and halve it until the required buffers fit in the
// memory remaining on the GPU.
//
let mut parallelism = 16;
while parallelism > 0 {
// `num_aes_inputs` refers to the number of 128-bit ciphertexts that AES will produce.
//
let aes_encrypt_size =
self.get_aes_encrypt_size_on_gpu(num_aes_inputs, parallelism, streams);
if check_valid_cuda_malloc(aes_encrypt_size, gpu_index) {
let round_keys = unsafe { self.key_expansion_async(key, streams) };
let res = unsafe {
self.aes_encrypt_async(
iv,
&round_keys,
start_counter,
num_aes_inputs,
parallelism,
streams,
)
};
streams.synchronize();
return res;
}
parallelism /= 2;
}
panic!("Failed to allocate GPU memory for AES, even with the lowest parallelism setting.");
}
pub fn aes_ctr_with_fixed_parallelism(
&self,
key: &CudaUnsignedRadixCiphertext,
iv: &CudaUnsignedRadixCiphertext,
start_counter: u128,
num_aes_inputs: usize,
sbox_parallelism: usize,
streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
assert!(
[1, 2, 4, 8, 16].contains(&sbox_parallelism),
"Invalid S-Box parallelism: must be one of [1, 2, 4, 8, 16], got {sbox_parallelism}"
);
let gpu_index = streams.gpu_indexes[0];
let key_expansion_size = self.get_key_expansion_size_on_gpu(streams);
check_valid_cuda_malloc_assert_oom(key_expansion_size, gpu_index);
let aes_encrypt_size =
self.get_aes_encrypt_size_on_gpu(num_aes_inputs, sbox_parallelism, streams);
check_valid_cuda_malloc_assert_oom(aes_encrypt_size, gpu_index);
let round_keys = unsafe { self.key_expansion_async(key, streams) };
let res = unsafe {
self.aes_encrypt_async(
iv,
&round_keys,
start_counter,
num_aes_inputs,
sbox_parallelism,
streams,
)
};
streams.synchronize();
res
}
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
/// synchronization is required
pub unsafe fn aes_encrypt_async(
&self,
iv: &CudaUnsignedRadixCiphertext,
round_keys: &CudaUnsignedRadixCiphertext,
start_counter: u128,
num_aes_inputs: usize,
sbox_parallelism: usize,
streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
let mut result: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(num_aes_inputs * 128, streams);
let num_round_key_blocks = 11 * NUM_BITS;
assert_eq!(
iv.as_ref().d_blocks.lwe_ciphertext_count().0,
NUM_BITS,
"AES IV must contain {NUM_BITS} encrypted bits, but contains {}",
iv.as_ref().d_blocks.lwe_ciphertext_count().0
);
assert_eq!(
round_keys.as_ref().d_blocks.lwe_ciphertext_count().0,
num_round_key_blocks,
"AES round_keys must contain {num_round_key_blocks} encrypted bits, but contains {}",
round_keys.as_ref().d_blocks.lwe_ciphertext_count().0
);
assert_eq!(
result.as_ref().d_blocks.lwe_ciphertext_count().0,
num_aes_inputs * 128,
"AES result must contain {} encrypted bits for {num_aes_inputs} blocks, but contains {}",
num_aes_inputs * 128,
result.as_ref().d_blocks.lwe_ciphertext_count().0
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
streams,
result.as_mut(),
iv.as_ref(),
round_keys.as_ref(),
start_counter,
num_aes_inputs as u32,
sbox_parallelism as u32,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
streams,
result.as_mut(),
iv.as_ref(),
round_keys.as_ref(),
start_counter,
num_aes_inputs as u32,
sbox_parallelism as u32,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
PBSType::MultiBit,
None,
);
}
}
result
}
fn get_aes_encrypt_size_on_gpu(
&self,
num_aes_inputs: usize,
sbox_parallelism: usize,
streams: &CudaStreams,
) -> u64 {
let size = unsafe {
self.get_aes_encrypt_size_on_gpu_async(num_aes_inputs, sbox_parallelism, streams)
};
streams.synchronize();
size
}
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
/// synchronization is required
unsafe fn get_aes_encrypt_size_on_gpu_async(
&self,
num_aes_inputs: usize,
sbox_parallelism: usize,
streams: &CudaStreams,
) -> u64 {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_aes_ctr_encrypt_integer_radix_size_on_gpu(
streams,
num_aes_inputs as u32,
sbox_parallelism as u32,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_aes_ctr_encrypt_integer_radix_size_on_gpu(
streams,
num_aes_inputs as u32,
sbox_parallelism as u32,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
PBSType::MultiBit,
None,
)
}
}
}
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
/// synchronization is required
pub unsafe fn key_expansion_async(
&self,
key: &CudaUnsignedRadixCiphertext,
streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
let num_round_keys = 11;
let num_key_bits = 128;
let mut expanded_keys: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(num_round_keys * num_key_bits, streams);
assert_eq!(
key.as_ref().d_blocks.lwe_ciphertext_count().0,
num_key_bits,
"Input key must contain {} encrypted bits, but contains {}",
num_key_bits,
key.as_ref().d_blocks.lwe_ciphertext_count().0
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_key_expansion_integer_radix_kb_assign_async(
streams,
expanded_keys.as_mut(),
key.as_ref(),
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_key_expansion_integer_radix_kb_assign_async(
streams,
expanded_keys.as_mut(),
key.as_ref(),
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
PBSType::MultiBit,
None,
);
}
}
expanded_keys
}
fn get_key_expansion_size_on_gpu(&self, streams: &CudaStreams) -> u64 {
let size = unsafe { self.get_key_expansion_size_on_gpu_async(streams) };
streams.synchronize();
size
}
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
/// synchronization is required
unsafe fn get_key_expansion_size_on_gpu_async(&self, streams: &CudaStreams) -> u64 {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_key_expansion_integer_radix_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_key_expansion_integer_radix_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
PBSType::MultiBit,
None,
)
}
}
}
}
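A minimal usage sketch of this API (hypothetical glue code; `cks`, `sks`, and `streams` are assumed to be set up as in the tests below):

// Encrypt an AES key and IV bit-by-bit (128 one-bit radix blocks each).
let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
let ct_key = cks.encrypt_u128_for_aes_ctr(key);
let ct_iv = cks.encrypt_u128_for_aes_ctr(iv);

// Move both to the GPU and run AES-CTR over 4 counter blocks starting at 0;
// `aes_ctr` expands the key and picks the largest S-box parallelism that fits.
let d_key = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_key, &streams);
let d_iv = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_iv, &streams);
let d_res = sks.aes_ctr(&d_key, &d_iv, 0, 4, &streams);

// Copy the result back and decrypt the four 128-bit keystream blocks.
let res = d_res.to_radix_ciphertext(&streams);
let blocks = cks.decrypt_u128_from_aes_ctr(&res, 4);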

View File

@@ -57,6 +57,7 @@ mod sub;
mod vector_comparisons;
mod vector_find;
mod aes;
#[cfg(test)]
mod tests_long_run;
#[cfg(test)]

View File

@@ -1,4 +1,5 @@
pub(crate) mod test_add;
pub(crate) mod test_aes;
pub(crate) mod test_bitwise_op;
pub(crate) mod test_cmux;
pub(crate) mod test_comparison;
@@ -82,6 +83,98 @@ impl<F> GpuFunctionExecutor<F> {
}
}
impl<'a, F>
FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
RadixCiphertext,
> for GpuFunctionExecutor<F>
where
F: Fn(
&CudaServerKey,
&CudaUnsignedRadixCiphertext,
&CudaUnsignedRadixCiphertext,
u128,
usize,
usize,
&CudaStreams,
) -> CudaUnsignedRadixCiphertext,
{
fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>) {
self.setup_from_keys(cks, &sks);
}
fn execute(
&mut self,
input: (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
) -> RadixCiphertext {
let context = self
.context
.as_ref()
.expect("setup was not properly called");
let d_ctxt_1 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(input.0, &context.streams);
let d_ctxt_2 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(input.1, &context.streams);
let gpu_result = (self.func)(
&context.sks,
&d_ctxt_1,
&d_ctxt_2,
input.2,
input.3,
input.4,
&context.streams,
);
gpu_result.to_radix_ciphertext(&context.streams)
}
}
impl<'a, F>
FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize), RadixCiphertext>
for GpuFunctionExecutor<F>
where
F: Fn(
&CudaServerKey,
&CudaUnsignedRadixCiphertext,
&CudaUnsignedRadixCiphertext,
u128,
usize,
&CudaStreams,
) -> CudaUnsignedRadixCiphertext,
{
fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>) {
self.setup_from_keys(cks, &sks);
}
fn execute(
&mut self,
input: (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize),
) -> RadixCiphertext {
let context = self
.context
.as_ref()
.expect("setup was not properly called");
let d_ctxt_1 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(input.0, &context.streams);
let d_ctxt_2 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(input.1, &context.streams);
let gpu_result = (self.func)(
&context.sks,
&d_ctxt_1,
&d_ctxt_2,
input.2,
input.3,
&context.streams,
);
gpu_result.to_radix_ciphertext(&context.streams)
}
}
/// For default/unchecked binary functions
impl<'a, F> FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext), RadixCiphertext>
for GpuFunctionExecutor<F>

View File

@@ -0,0 +1,59 @@
use crate::integer::gpu::server_key::radix::tests_unsigned::{
create_gpu_parameterized_test, GpuFunctionExecutor,
};
use crate::integer::gpu::CudaServerKey;
use crate::integer::server_key::radix_parallel::tests_cases_unsigned::{
aes_dynamic_parallelism_many_inputs_test, aes_fixed_parallelism_1_input_test,
aes_fixed_parallelism_2_inputs_test,
};
use crate::shortint::parameters::{
TestParameters, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
create_gpu_parameterized_test!(integer_aes_fixed_parallelism_1_input {
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
});
create_gpu_parameterized_test!(integer_aes_fixed_parallelism_2_inputs {
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
});
create_gpu_parameterized_test!(integer_aes_dynamic_parallelism_many_inputs {
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
});
// The following two tests are referred to as "fixed_parallelism" because the objective is to test
// AES, in CTR mode, across all possible parallelizations of the S-box. The S-box must process 16
// bytes; the parallelization refers to the number of bytes it will process in parallel in one call:
// 1, 2, 4, 8, or 16.
//
fn integer_aes_fixed_parallelism_1_input<P>(param: P)
where
P: Into<TestParameters>,
{
let executor = GpuFunctionExecutor::new(&CudaServerKey::aes_ctr_with_fixed_parallelism);
aes_fixed_parallelism_1_input_test(param, executor);
}
fn integer_aes_fixed_parallelism_2_inputs<P>(param: P)
where
P: Into<TestParameters>,
{
let executor = GpuFunctionExecutor::new(&CudaServerKey::aes_ctr_with_fixed_parallelism);
aes_fixed_parallelism_2_inputs_test(param, executor);
}
// The "dynamic_parallelism" test seeks the maximum S-box parallelization
// that the machine can support.
//
fn integer_aes_dynamic_parallelism_many_inputs<P>(param: P)
where
P: Into<TestParameters>,
{
let executor = GpuFunctionExecutor::new(&CudaServerKey::aes_ctr);
aes_dynamic_parallelism_many_inputs_test(param, executor);
}

View File

@@ -43,6 +43,11 @@ pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_
default_add_test, unchecked_add_assign_test,
};
#[cfg(feature = "gpu")]
pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_aes::{
aes_dynamic_parallelism_many_inputs_test, aes_fixed_parallelism_1_input_test,
aes_fixed_parallelism_2_inputs_test,
};
#[cfg(feature = "gpu")]
pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_neg::default_neg_test;
pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_neg::unchecked_neg_test;
#[cfg(feature = "gpu")]

View File

@@ -1,5 +1,6 @@
mod modulus_switch_compression;
pub(crate) mod test_add;
pub(crate) mod test_aes;
pub(crate) mod test_bitwise_op;
mod test_block_rotate;
mod test_block_shift;

View File

@@ -0,0 +1,225 @@
#![cfg(feature = "gpu")]
use crate::integer::keycache::KEY_CACHE;
use crate::integer::server_key::radix_parallel::tests_cases_unsigned::FunctionExecutor;
use crate::integer::{IntegerKeyKind, RadixCiphertext, RadixClientKey};
use crate::shortint::parameters::TestParameters;
use std::sync::Arc;
const S_BOX: [u8; 256] = [
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
];
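// Reference (plaintext) AES-128 key schedule per FIPS 197: every fourth word
// gets RotWord, SubWord and a round constant applied before being XORed with
// the word four positions back.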
fn plain_key_expansion(key: u128) -> Vec<u128> {
const RCON: [u32; 10] = [
0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000,
0x80000000, 0x1B000000, 0x36000000,
];
let mut words = [0u32; 44];
for (i, word) in words.iter_mut().enumerate().take(4) {
*word = (key >> (96 - (i * 32))) as u32;
}
for i in 4..44 {
let mut temp = words[i - 1];
if i % 4 == 0 {
temp = temp.rotate_left(8);
let mut sub_bytes = 0u32;
for j in 0..4 {
let byte = (temp >> (24 - j * 8)) as u8;
sub_bytes |= (S_BOX[byte as usize] as u32) << (24 - j * 8);
}
temp = sub_bytes ^ RCON[i / 4 - 1];
}
words[i] = words[i - 4] ^ temp;
}
words
.chunks_exact(4)
.map(|chunk| {
((chunk[0] as u128) << 96)
| ((chunk[1] as u128) << 64)
| ((chunk[2] as u128) << 32)
| (chunk[3] as u128)
})
.collect()
}
fn sub_bytes(state: &mut [u8; 16]) {
for byte in state.iter_mut() {
*byte = S_BOX[*byte as usize];
}
}
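// The state is laid out column-major (index = 4 * col + row), so row r is
// {r, r + 4, r + 8, r + 12}; ShiftRows rotates row r left by r positions.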
fn shift_rows(state: &mut [u8; 16]) {
let original = *state;
state[1] = original[5];
state[5] = original[9];
state[9] = original[13];
state[13] = original[1];
state[2] = original[10];
state[6] = original[14];
state[10] = original[2];
state[14] = original[6];
state[3] = original[15];
state[7] = original[3];
state[11] = original[7];
state[15] = original[11];
}
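// Multiplication in GF(2^8) modulo the AES polynomial
// x^8 + x^4 + x^3 + x + 1; the `a ^= 0x1B` step performs that reduction.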
fn gmul(mut a: u8, mut b: u8) -> u8 {
let mut p = 0;
for _ in 0..8 {
if (b & 1) != 0 {
p ^= a;
}
let hi_bit_set = (a & 0x80) != 0;
a <<= 1;
if hi_bit_set {
a ^= 0x1B;
}
b >>= 1;
}
p
}
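// MixColumns multiplies each state column by the circulant MDS matrix
// [2 3 1 1; 1 2 3 1; 1 1 2 3; 3 1 1 2] over GF(2^8).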
fn mix_columns(state: &mut [u8; 16]) {
let original = *state;
for i in 0..4 {
let col = i * 4;
state[col] = gmul(original[col], 2)
^ gmul(original[col + 1], 3)
^ original[col + 2]
^ original[col + 3];
state[col + 1] = original[col]
^ gmul(original[col + 1], 2)
^ gmul(original[col + 2], 3)
^ original[col + 3];
state[col + 2] = original[col]
^ original[col + 1]
^ gmul(original[col + 2], 2)
^ gmul(original[col + 3], 3);
state[col + 3] = gmul(original[col], 3)
^ original[col + 1]
^ original[col + 2]
^ gmul(original[col + 3], 2);
}
}
fn add_round_key(state: &mut [u8; 16], round_key: u128) {
let key_bytes = round_key.to_be_bytes();
for i in 0..16 {
state[i] ^= key_bytes[i];
}
}
fn plain_aes_encrypt_block(block_bytes: &mut [u8; 16], expanded_keys: &[u128]) {
add_round_key(block_bytes, expanded_keys[0]);
for round_key in expanded_keys.iter().take(10).skip(1) {
sub_bytes(block_bytes);
shift_rows(block_bytes);
mix_columns(block_bytes);
add_round_key(block_bytes, *round_key);
}
sub_bytes(block_bytes);
shift_rows(block_bytes);
add_round_key(block_bytes, expanded_keys[10]);
}
fn plain_aes_ctr(num_aes_inputs: usize, iv: u128, key: u128) -> Vec<u128> {
let expanded_keys = plain_key_expansion(key);
let mut results = Vec::with_capacity(num_aes_inputs);
for i in 0..num_aes_inputs {
let counter_value = iv.wrapping_add(i as u128);
let mut block = counter_value.to_be_bytes();
plain_aes_encrypt_block(&mut block, &expanded_keys);
results.push(u128::from_be_bytes(block));
}
results
}
fn internal_aes_fixed_parallelism_test<P, E>(param: P, mut executor: E, num_aes_inputs: usize)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
RadixCiphertext,
>,
{
let param = param.into();
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let cks = RadixClientKey::from((cks, 1));
let sks = Arc::new(sks);
executor.setup(&cks, sks);
let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
let plain_results = plain_aes_ctr(num_aes_inputs, iv, key);
let ctxt_key = cks.encrypt_u128_for_aes_ctr(key);
let ctxt_iv = cks.encrypt_u128_for_aes_ctr(iv);
for sbox_parallelism in [1, 2, 4, 8, 16] {
let encrypted_result =
executor.execute((&ctxt_key, &ctxt_iv, 0, num_aes_inputs, sbox_parallelism));
let fhe_results = cks.decrypt_u128_from_aes_ctr(&encrypted_result, num_aes_inputs);
assert_eq!(fhe_results, plain_results);
}
}
pub fn aes_fixed_parallelism_1_input_test<P, E>(param: P, executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
RadixCiphertext,
>,
{
internal_aes_fixed_parallelism_test(param, executor, 1);
}
pub fn aes_fixed_parallelism_2_inputs_test<P, E>(param: P, executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
RadixCiphertext,
>,
{
internal_aes_fixed_parallelism_test(param, executor, 2);
}
pub fn aes_dynamic_parallelism_many_inputs_test<P, E>(param: P, mut executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize),
RadixCiphertext,
>,
{
let param = param.into();
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let cks = RadixClientKey::from((cks, 1));
let sks = Arc::new(sks);
executor.setup(&cks, sks);
let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
let ctxt_key = cks.encrypt_u128_for_aes_ctr(key);
let ctxt_iv = cks.encrypt_u128_for_aes_ctr(iv);
for num_aes_inputs in [4, 8, 16, 32] {
let plain_results = plain_aes_ctr(num_aes_inputs, iv, key);
let encrypted_result = executor.execute((&ctxt_key, &ctxt_iv, 0, num_aes_inputs));
let fhe_results = cks.decrypt_u128_from_aes_ctr(&encrypted_result, num_aes_inputs);
assert_eq!(fhe_results, plain_results);
}
}