refactor(gpu): shrink AES state and key width from 128 to 64 bits

2026-01-08 22:28:01 -05:00 · 2025-10-28 10:42:13 +01:00
parent 95593b1ea9
commit cfb1d1340e
7 changed files with 95 additions and 168 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -20,7 +20,7 @@ template <typename Torus> struct int_aes_lut_buffers {
                      bool allocate_gpu_memory, uint32_t num_aes_inputs,
                      uint32_t sbox_parallelism, uint64_t &size_tracker) {

-    constexpr uint32_t AES_STATE_BITS = 128;
+    constexpr uint32_t AES_STATE_BITS = 64;
    constexpr uint32_t SBOX_MAX_AND_GATES = 18;

    this->and_lut = new int_radix_lut<Torus>(
@@ -238,7 +238,7 @@ template <typename Torus> struct int_aes_main_workspaces {
                          bool allocate_gpu_memory, uint32_t num_aes_inputs,
                          uint32_t sbox_parallelism, uint64_t &size_tracker) {

-    constexpr uint32_t AES_STATE_BITS = 128;
+    constexpr uint32_t AES_STATE_BITS = 64;
    constexpr uint32_t SBOX_MAX_AND_GATES = 18;
    constexpr uint32_t BATCH_BUFFER_OPERANDS = 3;

@@ -397,7 +397,7 @@ template <typename Torus> struct int_key_expansion_buffer {
    this->params = params;
    this->allocate_gpu_memory = allocate_gpu_memory;

-    constexpr uint32_t TOTAL_WORDS = 44;
+    constexpr uint32_t TOTAL_WORDS = 22;
    constexpr uint32_t BITS_PER_WORD = 32;
    constexpr uint32_t TOTAL_BITS = TOTAL_WORDS * BITS_PER_WORD;

--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
@@ -621,7 +621,7 @@ __host__ void vectorized_shift_rows(CudaStreams streams,
                                    CudaRadixCiphertextFFI *state_bitsliced,
                                    uint32_t num_aes_inputs,
                                    int_aes_encrypt_buffer<Torus> *mem) {
-  constexpr uint32_t NUM_BYTES = 16;
+  constexpr uint32_t NUM_BYTES = 8;
  constexpr uint32_t LEN_BYTE = 8;
  constexpr uint32_t NUM_BITS = NUM_BYTES * LEN_BYTE;

@@ -649,8 +649,7 @@ __host__ void vectorized_shift_rows(CudaStreams streams,
        i * num_aes_inputs, (i + 1) * num_aes_inputs);
  }

-  const int shift_rows_map[] = {0, 5,  10, 15, 4,  9, 14, 3,
-                                8, 13, 2,  7,  12, 1, 6,  11};
+  const int shift_rows_map[] = {0, 1, 3, 2, 4, 5, 7, 6};

  for (int i = 0; i < NUM_BYTES; i++) {
    for (int bit = 0; bit < LEN_BYTE; bit++) {
@@ -711,7 +710,7 @@ __host__ void vectorized_mix_columns(CudaStreams streams,

  constexpr uint32_t BITS_PER_BYTE = 8;
  constexpr uint32_t BYTES_PER_COLUMN = 4;
-  constexpr uint32_t NUM_COLUMNS = 4;
+  constexpr uint32_t NUM_COLUMNS = 2;
  constexpr uint32_t BITS_PER_COLUMN = BYTES_PER_COLUMN * BITS_PER_BYTE;

  for (uint32_t col = 0; col < NUM_COLUMNS; ++col) {
@@ -849,7 +848,7 @@ __host__ void vectorized_aes_encrypt_inplace(
    int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {

  constexpr uint32_t BITS_PER_BYTE = 8;
-  constexpr uint32_t STATE_BYTES = 16;
+  constexpr uint32_t STATE_BYTES = 8;
  constexpr uint32_t STATE_BITS = STATE_BYTES * BITS_PER_BYTE;
  constexpr uint32_t ROUNDS = 10;

@@ -910,6 +909,7 @@ __host__ void vectorized_aes_encrypt_inplace(
                                       mem, bsks, ksks);
      }
      break;
+    case 16:
    case 8:
      for (uint32_t i = 0; i < STATE_BYTES; i += 8) {
        CudaRadixCiphertextFFI *sbox_inputs[] = {
@@ -921,19 +921,6 @@ __host__ void vectorized_aes_encrypt_inplace(
                                       mem, bsks, ksks);
      }
      break;
-    case 16: {
-      CudaRadixCiphertextFFI *sbox_inputs[] = {
-          &s_bits[0 * BITS_PER_BYTE],  &s_bits[1 * BITS_PER_BYTE],
-          &s_bits[2 * BITS_PER_BYTE],  &s_bits[3 * BITS_PER_BYTE],
-          &s_bits[4 * BITS_PER_BYTE],  &s_bits[5 * BITS_PER_BYTE],
-          &s_bits[6 * BITS_PER_BYTE],  &s_bits[7 * BITS_PER_BYTE],
-          &s_bits[8 * BITS_PER_BYTE],  &s_bits[9 * BITS_PER_BYTE],
-          &s_bits[10 * BITS_PER_BYTE], &s_bits[11 * BITS_PER_BYTE],
-          &s_bits[12 * BITS_PER_BYTE], &s_bits[13 * BITS_PER_BYTE],
-          &s_bits[14 * BITS_PER_BYTE], &s_bits[15 * BITS_PER_BYTE]};
-      vectorized_sbox_n_bytes<Torus>(streams, sbox_inputs, 16, num_aes_inputs,
-                                     mem, bsks, ksks);
-    } break;
    default:
      PANIC("Unsupported S-Box parallelism level selected: %u",
            sbox_parallelism);
@@ -993,7 +980,7 @@ __host__ void vectorized_aes_full_adder_inplace(
    const Torus *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
    int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {

-  constexpr uint32_t NUM_BITS = 128;
+  constexpr uint32_t NUM_BITS = 64;

  // --- Initialization ---
  CudaRadixCiphertextFFI *carry_vec =
@@ -1098,7 +1085,7 @@ __host__ void host_integer_aes_ctr_encrypt(
    const Torus *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
    int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {

-  constexpr uint32_t NUM_BITS = 128;
+  constexpr uint32_t NUM_BITS = 64;

  CudaRadixCiphertextFFI *initial_states =
      mem->main_workspaces->initial_states_and_jit_key_workspace;
@@ -1159,8 +1146,8 @@ __host__ void host_integer_key_expansion(CudaStreams streams,
  constexpr uint32_t BITS_PER_WORD = 32;
  constexpr uint32_t BITS_PER_BYTE = 8;
  constexpr uint32_t BYTES_PER_WORD = 4;
-  constexpr uint32_t TOTAL_WORDS = 44;
-  constexpr uint32_t KEY_WORDS = 4;
+  constexpr uint32_t TOTAL_WORDS = 22;
+  constexpr uint32_t KEY_WORDS = 2;

  const Torus rcon[] = {0x01, 0x02, 0x04, 0x08, 0x10,
                        0x20, 0x40, 0x80, 0x1b, 0x36};
@@ -1178,8 +1165,8 @@ __host__ void host_integer_key_expansion(CudaStreams streams,

    as_radix_ciphertext_slice<Torus>(&tmp_word_buffer, mem->tmp_word_buffer, 0,
                                     BITS_PER_WORD);
-    as_radix_ciphertext_slice<Torus>(&tmp_far, words, (w - 4) * BITS_PER_WORD,
-                                     (w - 3) * BITS_PER_WORD);
+    as_radix_ciphertext_slice<Torus>(&tmp_far, words, (w - 2) * BITS_PER_WORD,
+                                     (w - 1) * BITS_PER_WORD);
    as_radix_ciphertext_slice<Torus>(&tmp_near, words, (w - 1) * BITS_PER_WORD,
                                     w * BITS_PER_WORD);

--- a/tfhe-benchmark/benches/integer/aes.rs
+++ b/tfhe-benchmark/benches/integer/aes.rs
@@ -23,9 +23,9 @@ pub mod cuda {
        let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let atomic_param: AtomicPatternParameters = param.into();

-        let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
-        let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
-        let aes_op_bit_size = 128;
+        let key: u64 = 0x2b7e151628aed2a6;
+        let iv: u64 = 0xf0f1f2f3f4f5f6f7;
+        let aes_op_bit_size = 64;

        let param_name = param.name();

@@ -36,14 +36,14 @@ pub mod cuda {
                let sks = CudaServerKey::new(&cpu_cks, &streams);
                let cks = RadixClientKey::from((cpu_cks, 1));

-                let ct_key = cks.encrypt_u128_for_aes_ctr(key);
-                let ct_iv = cks.encrypt_u128_for_aes_ctr(iv);
+                let ct_key = cks.encrypt_u64_for_aes_ctr(key);
+                let ct_iv = cks.encrypt_u64_for_aes_ctr(iv);

                let d_key = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_key, &streams);
                let d_iv = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_iv, &streams);

                {
-                    const NUM_AES_INPUTS: usize = 1;
+                    const NUM_AES_INPUTS: usize = 2;
                    const SBOX_PARALLELISM: usize = 16;
                    let bench_id = format!("{param_name}::{NUM_AES_INPUTS}_input_encryption");

@@ -105,8 +105,8 @@ pub mod cuda {

                bench_group.throughput(Throughput::Elements(NUM_AES_INPUTS as u64));

-                let ct_key = cks.encrypt_u128_for_aes_ctr(key);
-                let ct_iv = cks.encrypt_u128_for_aes_ctr(iv);
+                let ct_key = cks.encrypt_u64_for_aes_ctr(key);
+                let ct_iv = cks.encrypt_u64_for_aes_ctr(iv);

                let d_key = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_key, &streams);
                let d_iv = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_iv, &streams);
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -7490,7 +7490,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_aes_ctr_encrypt<T: UnsignedInteger,
    output: &mut CudaRadixCiphertext,
    iv: &CudaRadixCiphertext,
    round_keys: &CudaRadixCiphertext,
-    start_counter: u128,
+    start_counter: u64,
    num_aes_inputs: u32,
    sbox_parallelism: u32,
    bootstrapping_key: &CudaVec<B>,
@@ -7534,8 +7534,8 @@ pub(crate) unsafe fn cuda_backend_unchecked_aes_ctr_encrypt<T: UnsignedInteger,

    let counter_bits_le: Vec<u64> = (0..num_aes_inputs)
        .flat_map(|i| {
-            let current_counter = start_counter + i as u128;
-            (0..128).map(move |bit_index| ((current_counter >> bit_index) & 1) as u64)
+            let current_counter = start_counter + i as u64;
+            (0..64).map(move |bit_index| (current_counter >> bit_index) & 1)
        })
        .collect();

--- a/tfhe/src/integer/gpu/server_key/radix/aes.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/aes.rs
@@ -13,88 +13,32 @@ use crate::integer::gpu::{
 use crate::integer::{RadixCiphertext, RadixClientKey};
 use crate::shortint::Ciphertext;

-const NUM_BITS: usize = 128;
+const NUM_BITS: usize = 64;

 impl RadixClientKey {
-    /// Encrypts a 128-bit block for homomorphic AES evaluation.
-    ///
-    /// This function prepares a 128-bit plaintext block (like an AES key or IV)
-    /// for homomorphic processing by decomposing it into its 128 constituent bits
-    /// and encrypting each bit individually with FHE.
-    ///
-    /// The process is as follows:
-    /// ```text
-    /// // INPUT: A 128-bit plaintext block
-    /// Plaintext block (u128): 0x2b7e1516...
-    ///       |
-    ///       V
-    /// // 1. Decompose the block into individual bits
-    /// Individual bits: [b127, b126, ..., b1, b0]
-    ///       |
-    ///       V
-    /// // 2. Encrypt each bit individually using FHE
-    /// `self.encrypt(bit)` is applied to each bit
-    ///       |
-    ///       V
-    /// // 3. Collect the resulting bit-ciphertexts
-    /// Ciphertexts: [Ct(b127), Ct(b126), ..., Ct(b0)]
-    ///       |
-    ///       V
-    /// // 4. Group the bit-ciphertexts into a single RadixCiphertext
-    /// //    representing the full encrypted block.
-    /// // OUTPUT: A RadixCiphertext
-    /// ```
-    pub fn encrypt_u128_for_aes_ctr(&self, data: u128) -> RadixCiphertext {
+    pub fn encrypt_u64_for_aes_ctr(&self, data: u64) -> RadixCiphertext {
        let mut blocks: Vec<Ciphertext> = Vec::with_capacity(NUM_BITS);
        for i in 0..NUM_BITS {
-            let bit = ((data >> (NUM_BITS - 1 - i)) & 1) as u64;
+            let bit = (data >> (NUM_BITS - 1 - i)) & 1;
            blocks.extend(self.encrypt(bit).blocks);
        }
        RadixCiphertext::from(blocks)
    }

-    /// Decrypts a `RadixCiphertext` containing one or more 128-bit blocks
-    /// that were homomorphically processed.
-    ///
-    /// This function reverses the encryption process by decrypting each individual
-    /// bit-ciphertext and reassembling them into 128-bit plaintext blocks.
-    ///
-    /// The process is as follows:
-    /// ```text
-    /// // INPUT: RadixCiphertext containing one or more encrypted blocks
-    /// Ciphertext collection: [Ct(b127), ..., Ct(b0), Ct(b'127), ..., Ct(b'0), ...]
-    ///       |
-    ///       | (For each sequence of 128 bit-ciphertexts)
-    ///       V
-    /// // 1. Decrypt each bit's ciphertext individually
-    /// `self.decrypt(Ct)` is applied to each bit-ciphertext
-    ///       |
-    ///       V
-    /// // 2. Collect the resulting plaintext bits
-    /// Plaintext bits: [b127, b126, ..., b0]
-    ///       |
-    ///       V
-    /// // 3. Assemble the bits back into a 128-bit block
-    /// Reconstruction: ( ...((b127 << 1) | b126) << 1 | ... ) | b0
-    ///       |
-    ///       V
-    /// // OUTPUT: A vector of plaintext u128 blocks
-    /// Plaintext u128s: [0x..., ...]
-    /// ```
-    pub fn decrypt_u128_from_aes_ctr(
+    pub fn decrypt_u64_from_aes_ctr(
        &self,
        encrypted_result: &RadixCiphertext,
        num_aes_inputs: usize,
-    ) -> Vec<u128> {
+    ) -> Vec<u64> {
        let mut plaintext_results = Vec::with_capacity(num_aes_inputs);
        for i in 0..num_aes_inputs {
-            let mut current_block_plaintext: u128 = 0;
+            let mut current_block_plaintext: u64 = 0;
            let block_start_index = i * NUM_BITS;
            for j in 0..NUM_BITS {
                let block_slice =
                    &encrypted_result.blocks[block_start_index + j..block_start_index + j + 1];
                let block_radix_ct = RadixCiphertext::from(block_slice.to_vec());
-                let decrypted_bit: u128 = self.decrypt(&block_radix_ct);
+                let decrypted_bit: u64 = self.decrypt(&block_radix_ct);
                current_block_plaintext = (current_block_plaintext << 1) | decrypted_bit;
            }
            plaintext_results.push(current_block_plaintext);
@@ -108,7 +52,7 @@ impl CudaServerKey {
        &self,
        key: &CudaUnsignedRadixCiphertext,
        iv: &CudaUnsignedRadixCiphertext,
-        start_counter: u128,
+        start_counter: u64,
        num_aes_inputs: usize,
        streams: &CudaStreams,
    ) -> CudaUnsignedRadixCiphertext {
@@ -154,7 +98,7 @@ impl CudaServerKey {
        &self,
        key: &CudaUnsignedRadixCiphertext,
        iv: &CudaUnsignedRadixCiphertext,
-        start_counter: u128,
+        start_counter: u64,
        num_aes_inputs: usize,
        sbox_parallelism: usize,
        streams: &CudaStreams,
@@ -188,13 +132,13 @@ impl CudaServerKey {
        &self,
        iv: &CudaUnsignedRadixCiphertext,
        round_keys: &CudaUnsignedRadixCiphertext,
-        start_counter: u128,
+        start_counter: u64,
        num_aes_inputs: usize,
        sbox_parallelism: usize,
        streams: &CudaStreams,
    ) -> CudaUnsignedRadixCiphertext {
        let mut result: CudaUnsignedRadixCiphertext =
-            self.create_trivial_zero_radix(num_aes_inputs * 128, streams);
+            self.create_trivial_zero_radix(num_aes_inputs * NUM_BITS, streams);

        let num_round_key_blocks = 11 * NUM_BITS;

@@ -212,9 +156,9 @@ impl CudaServerKey {
        );
        assert_eq!(
            result.as_ref().d_blocks.lwe_ciphertext_count().0,
-            num_aes_inputs * 128,
+            num_aes_inputs * NUM_BITS,
            "AES result must contain {} encrypted bits for {num_aes_inputs} blocks, but contains {}",
-            num_aes_inputs * 128,
+            num_aes_inputs * NUM_BITS,
            result.as_ref().d_blocks.lwe_ciphertext_count().0
        );

@@ -327,7 +271,7 @@ impl CudaServerKey {
        streams: &CudaStreams,
    ) -> CudaUnsignedRadixCiphertext {
        let num_round_keys = 11;
-        let num_key_bits = 128;
+        let num_key_bits = 64;
        let mut expanded_keys: CudaUnsignedRadixCiphertext =
            self.create_trivial_zero_radix(num_round_keys * num_key_bits, streams);

--- a/tfhe/src/integer/gpu/server_key/radix/tests_unsigned/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_unsigned/mod.rs
@@ -85,16 +85,14 @@ impl<F> GpuFunctionExecutor<F> {
 }

 impl<'a, F>
-    FunctionExecutor<
-        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
-        RadixCiphertext,
-    > for GpuFunctionExecutor<F>
+    FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext, u64, usize, usize), RadixCiphertext>
+    for GpuFunctionExecutor<F>
 where
    F: Fn(
        &CudaServerKey,
        &CudaUnsignedRadixCiphertext,
        &CudaUnsignedRadixCiphertext,
-        u128,
+        u64,
        usize,
        usize,
        &CudaStreams,
@@ -106,7 +104,7 @@ where

    fn execute(
        &mut self,
-        input: (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
+        input: (&'a RadixCiphertext, &'a RadixCiphertext, u64, usize, usize),
    ) -> RadixCiphertext {
        let context = self
            .context
@@ -133,14 +131,14 @@ where
 }

 impl<'a, F>
-    FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize), RadixCiphertext>
+    FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext, u64, usize), RadixCiphertext>
    for GpuFunctionExecutor<F>
 where
    F: Fn(
        &CudaServerKey,
        &CudaUnsignedRadixCiphertext,
        &CudaUnsignedRadixCiphertext,
-        u128,
+        u64,
        usize,
        &CudaStreams,
    ) -> CudaUnsignedRadixCiphertext,
@@ -151,7 +149,7 @@ where

    fn execute(
        &mut self,
-        input: (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize),
+        input: (&'a RadixCiphertext, &'a RadixCiphertext, u64, usize),
    ) -> RadixCiphertext {
        let context = self
            .context
--- a/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/test_aes.rs
+++ b/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/test_aes.rs
@@ -25,57 +25,54 @@ const S_BOX: [u8; 256] = [
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
 ];

-fn plain_key_expansion(key: u128) -> Vec<u128> {
+fn plain_key_expansion(key: u64) -> Vec<u64> {
    const RCON: [u32; 10] = [
        0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000,
        0x80000000, 0x1B000000, 0x36000000,
    ];
-    let mut words = [0u32; 44];
-    for (i, word) in words.iter_mut().enumerate().take(4) {
-        *word = (key >> (96 - (i * 32))) as u32;
+    // 64-bit key (2 words) * 11 rounds = 22 words
+    const KEY_WORDS: usize = 2;
+    const TOTAL_WORDS: usize = 22;
+
+    let mut words = [0u32; TOTAL_WORDS];
+    for (i, word) in words.iter_mut().enumerate().take(KEY_WORDS) {
+        *word = (key >> (32 - (i * 32))) as u32;
    }
-    for i in 4..44 {
+
+    for i in KEY_WORDS..TOTAL_WORDS {
        let mut temp = words[i - 1];
-        if i % 4 == 0 {
+        if i % KEY_WORDS == 0 {
            temp = temp.rotate_left(8);
            let mut sub_bytes = 0u32;
            for j in 0..4 {
                let byte = (temp >> (24 - j * 8)) as u8;
                sub_bytes |= (S_BOX[byte as usize] as u32) << (24 - j * 8);
            }
-            temp = sub_bytes ^ RCON[i / 4 - 1];
+            temp = sub_bytes ^ RCON[i / KEY_WORDS - 1];
        }
-        words[i] = words[i - 4] ^ temp;
+        words[i] = words[i - KEY_WORDS] ^ temp;
    }
    words
-        .chunks_exact(4)
-        .map(|chunk| {
-            ((chunk[0] as u128) << 96)
-                | ((chunk[1] as u128) << 64)
-                | ((chunk[2] as u128) << 32)
-                | (chunk[3] as u128)
-        })
+        .chunks_exact(KEY_WORDS)
+        .map(|chunk| ((chunk[0] as u64) << 32) | (chunk[1] as u64))
        .collect()
 }
-fn sub_bytes(state: &mut [u8; 16]) {
+fn sub_bytes(state: &mut [u8; 8]) {
    for byte in state.iter_mut() {
        *byte = S_BOX[*byte as usize];
    }
 }
-fn shift_rows(state: &mut [u8; 16]) {
+fn shift_rows(state: &mut [u8; 8]) {
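+    // 4x2 state stored row-major: each row is a pair of consecutive bytes,
+    // i.e. rows are (s0, s1), (s2, s3), (s4, s5), (s6, s7).
+    // Row r rotates by r mod 2: rows 0 and 2 stay, rows 1 and 3 swap.
+    // NOTE(review): mix_columns treats bytes 0..3 and 4..7 as the columns
+    // (column-major) -- confirm the two layouts are meant to differ.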
    let original = *state;
-    state[1] = original[5];
-    state[5] = original[9];
-    state[9] = original[13];
-    state[13] = original[1];
-    state[2] = original[10];
-    state[6] = original[14];
-    state[10] = original[2];
-    state[14] = original[6];
-    state[3] = original[15];
-    state[7] = original[3];
-    state[11] = original[7];
-    state[15] = original[11];
+    state[2] = original[3];
+    state[3] = original[2];
+    state[6] = original[7];
+    state[7] = original[6];
 }
 fn gmul(mut a: u8, mut b: u8) -> u8 {
    let mut p = 0;
@@ -92,9 +89,10 @@ fn gmul(mut a: u8, mut b: u8) -> u8 {
    }
    p
 }
-fn mix_columns(state: &mut [u8; 16]) {
+fn mix_columns(state: &mut [u8; 8]) {
    let original = *state;
-    for i in 0..4 {
+    // 2 columns
+    for i in 0..2 {
        let col = i * 4;
        state[col] = gmul(original[col], 2)
            ^ gmul(original[col + 1], 3)
@@ -114,13 +112,13 @@ fn mix_columns(state: &mut [u8; 16]) {
            ^ gmul(original[col + 3], 2);
    }
 }
-fn add_round_key(state: &mut [u8; 16], round_key: u128) {
+fn add_round_key(state: &mut [u8; 8], round_key: u64) {
    let key_bytes = round_key.to_be_bytes();
-    for i in 0..16 {
+    for i in 0..8 {
        state[i] ^= key_bytes[i];
    }
 }
-fn plain_aes_encrypt_block(block_bytes: &mut [u8; 16], expanded_keys: &[u128]) {
+fn plain_aes_encrypt_block(block_bytes: &mut [u8; 8], expanded_keys: &[u64]) {
    add_round_key(block_bytes, expanded_keys[0]);
    for round_key in expanded_keys.iter().take(10).skip(1) {
        sub_bytes(block_bytes);
@@ -132,14 +130,14 @@ fn plain_aes_encrypt_block(block_bytes: &mut [u8; 16], expanded_keys: &[u128]) {
    shift_rows(block_bytes);
    add_round_key(block_bytes, expanded_keys[10]);
 }
-fn plain_aes_ctr(num_aes_inputs: usize, iv: u128, key: u128) -> Vec<u128> {
+fn plain_aes_ctr(num_aes_inputs: usize, iv: u64, key: u64) -> Vec<u64> {
    let expanded_keys = plain_key_expansion(key);
    let mut results = Vec::with_capacity(num_aes_inputs);
    for i in 0..num_aes_inputs {
-        let counter_value = iv.wrapping_add(i as u128);
+        let counter_value = iv.wrapping_add(i as u64);
        let mut block = counter_value.to_be_bytes();
        plain_aes_encrypt_block(&mut block, &expanded_keys);
-        results.push(u128::from_be_bytes(block));
+        results.push(u64::from_be_bytes(block));
    }
    results
 }
@@ -148,7 +146,7 @@ fn internal_aes_fixed_parallelism_test<P, E>(param: P, mut executor: E, num_aes_
 where
    P: Into<TestParameters>,
    E: for<'a> FunctionExecutor<
-        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
+        (&'a RadixCiphertext, &'a RadixCiphertext, u64, usize, usize),
        RadixCiphertext,
    >,
 {
@@ -158,18 +156,18 @@ where
    let sks = Arc::new(sks);
    executor.setup(&cks, sks);

-    let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
-    let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
+    let key: u64 = 0x2b7e151628aed2a6;
+    let iv: u64 = 0xf0f1f2f3f4f5f6f7;

    let plain_results = plain_aes_ctr(num_aes_inputs, iv, key);

-    let ctxt_key = cks.encrypt_u128_for_aes_ctr(key);
-    let ctxt_iv = cks.encrypt_u128_for_aes_ctr(iv);
+    let ctxt_key = cks.encrypt_u64_for_aes_ctr(key);
+    let ctxt_iv = cks.encrypt_u64_for_aes_ctr(iv);

    for sbox_parallelism in [1, 2, 4, 8, 16] {
        let encrypted_result =
            executor.execute((&ctxt_key, &ctxt_iv, 0, num_aes_inputs, sbox_parallelism));
-        let fhe_results = cks.decrypt_u128_from_aes_ctr(&encrypted_result, num_aes_inputs);
+        let fhe_results = cks.decrypt_u64_from_aes_ctr(&encrypted_result, num_aes_inputs);
        assert_eq!(fhe_results, plain_results);
    }
 }
@@ -178,7 +176,7 @@ pub fn aes_fixed_parallelism_1_input_test<P, E>(param: P, executor: E)
 where
    P: Into<TestParameters>,
    E: for<'a> FunctionExecutor<
-        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
+        (&'a RadixCiphertext, &'a RadixCiphertext, u64, usize, usize),
        RadixCiphertext,
    >,
 {
@@ -189,7 +187,7 @@ pub fn aes_fixed_parallelism_2_inputs_test<P, E>(param: P, executor: E)
 where
    P: Into<TestParameters>,
    E: for<'a> FunctionExecutor<
-        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
+        (&'a RadixCiphertext, &'a RadixCiphertext, u64, usize, usize),
        RadixCiphertext,
    >,
 {
@@ -200,7 +198,7 @@ pub fn aes_dynamic_parallelism_many_inputs_test<P, E>(param: P, mut executor: E)
 where
    P: Into<TestParameters>,
    E: for<'a> FunctionExecutor<
-        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize),
+        (&'a RadixCiphertext, &'a RadixCiphertext, u64, usize),
        RadixCiphertext,
    >,
 {
@@ -210,16 +208,16 @@ where
    let sks = Arc::new(sks);
    executor.setup(&cks, sks);

-    let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
-    let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;
+    let key: u64 = 0x2b7e151628aed2a6;
+    let iv: u64 = 0xf0f1f2f3f4f5f6f7;

-    let ctxt_key = cks.encrypt_u128_for_aes_ctr(key);
-    let ctxt_iv = cks.encrypt_u128_for_aes_ctr(iv);
+    let ctxt_key = cks.encrypt_u64_for_aes_ctr(key);
+    let ctxt_iv = cks.encrypt_u64_for_aes_ctr(iv);

    for num_aes_inputs in [4, 8, 16, 32] {
        let plain_results = plain_aes_ctr(num_aes_inputs, iv, key);
        let encrypted_result = executor.execute((&ctxt_key, &ctxt_iv, 0, num_aes_inputs));
-        let fhe_results = cks.decrypt_u128_from_aes_ctr(&encrypted_result, num_aes_inputs);
+        let fhe_results = cks.decrypt_u64_from_aes_ctr(&encrypted_result, num_aes_inputs);
        assert_eq!(fhe_results, plain_results);
    }
 }