chore(gpu): remove async variants of bitop, cmux, comparison and neg ops

Author: Agnes Leroy
Date: 2025-10-17 18:40:10 +02:00
Committed by: Agnès Leroy
Parent: cf20d73a5f
Commit: 4322214d8f
5 changed files with 423 additions and 919 deletions
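The change follows one pattern throughout: public `unsafe fn *_async` entry points plus safe wrappers that called `streams.synchronize()` are collapsed into single safe functions that confine the `unsafe` to the block around the raw backend call. A minimal sketch of the before/after shape, using hypothetical stand-in types rather than the real tfhe-rs API:

```rust
// Hypothetical stand-ins, not the tfhe-rs API; the point is the API shape.
struct Streams;

impl Streams {
    fn synchronize(&self) { /* stand-in for a CUDA stream sync */ }
}

// Before: the caller had to uphold the stream-safety contract.
unsafe fn op_async(_streams: &Streams) { /* stand-in: enqueue GPU work */ }

// Before: a separate safe wrapper discharged the contract by synchronizing.
fn op_old(streams: &Streams) {
    unsafe { op_async(streams) };
    streams.synchronize();
}

// After: one safe function; the `unsafe` block sits around the raw call and
// the safety argument lives inside the function instead of in its signature.
fn op(streams: &Streams) {
    // SAFETY (sketch): inputs outlive the enqueued work.
    unsafe { op_async(streams) };
}

fn main() {
    let streams = Streams;
    op_old(&streams);
    op(&streams); // no `unsafe` needed at the call site anymore
}
```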


@@ -64,11 +64,7 @@ impl CudaServerKey {
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn unchecked_bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
pub fn unchecked_bitnot_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
streams: &CudaStreams,
@@ -83,9 +79,13 @@ impl CudaServerKey {
let shift_plaintext = self.encoding().encode(Cleartext(u64::from(scalar))).0;
let scalar_vector = vec![shift_plaintext; ct_blocks];
let mut d_decomposed_scalar =
CudaVec::<u64>::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0);
d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0);
let mut d_decomposed_scalar = unsafe {
CudaVec::<u64>::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0)
};
unsafe {
d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0);
}
cuda_lwe_ciphertext_plaintext_add_assign(
&mut ct.as_mut().d_blocks,
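The device staging above stays asynchronous: `CudaVec::new_async` and `copy_from_cpu_async` are the same calls as before, now inside explicit `unsafe` blocks within a safe function. The host side of the staging is ordinary Rust: one copy of the encoded scalar per LWE ciphertext block. A std-only sketch (hypothetical helper name, not the library's):

```rust
// Toy stand-in for building `scalar_vector` above: replicate the encoded
// plaintext once per ciphertext block before the host-to-device copy.
fn stage_per_block_scalar(shift_plaintext: u64, ct_blocks: usize) -> Vec<u64> {
    vec![shift_plaintext; ct_blocks]
}

fn main() {
    assert_eq!(stage_per_block_scalar(7, 3), vec![7, 7, 7]);
}
```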
@@ -95,7 +95,7 @@ impl CudaServerKey {
ct.as_mut().info = ct.as_ref().info.after_bitnot();
}
pub(crate) unsafe fn unchecked_boolean_bitnot_assign_async(
pub fn unchecked_boolean_bitnot_assign(
&self,
ct: &mut CudaBooleanBlock,
streams: &CudaStreams,
@@ -108,9 +108,12 @@ impl CudaServerKey {
let shift_plaintext = self.encoding().encode(Cleartext(1u64)).0;
let scalar_vector = vec![shift_plaintext; ct_blocks];
let mut d_decomposed_scalar =
CudaVec::<u64>::new_async(ct.0.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0);
d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0);
let mut d_decomposed_scalar = unsafe {
CudaVec::<u64>::new_async(ct.0.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0)
};
unsafe {
d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0);
}
cuda_lwe_ciphertext_plaintext_add_assign(
&mut ct.0.as_mut().d_blocks,
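For intuition on why a per-block plaintext add can finish a bitwise NOT: assuming (as a hedge; the negation is outside the lines shown in these hunks) that each block has already been negated, adding the encoded constant `message_modulus - 1` yields `NOT m = (message_modulus - 1) - m` per block, and the boolean case is the same with constant 1. A plaintext check of that identity, consistent with the comment below that neither noise level nor degree changes:

```rust
// Plaintext check (assumption: the block negation happens outside these
// hunks): for a 2-bit block, NOT m == (message_modulus - 1) - m.
fn main() {
    let message_modulus: u64 = 4;
    for m in 0..message_modulus {
        assert_eq!((message_modulus - 1) - m, !m & (message_modulus - 1));
    }
    // Boolean case: NOT b == 1 - b for b in {0, 1}.
    for b in 0..2u64 {
        assert_eq!(1 - b, b ^ 1);
    }
}
```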
@@ -120,17 +123,6 @@ impl CudaServerKey {
// Neither the noise level nor the degree changes
}
pub fn unchecked_bitnot_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitnot_assign_async(ct, streams);
}
streams.synchronize();
}
/// Homomorphically computes the bitwise AND of two ciphertexts encrypting integer values.
///
/// This function computes the operation without checking if it exceeds the capacity of the
@@ -185,11 +177,7 @@ impl CudaServerKey {
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn unchecked_bitop_assign_async<T: CudaIntegerRadixCiphertext>(
pub(crate) fn unchecked_bitop_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
@@ -207,62 +195,64 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
);
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
);
}
}
}
}
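The hunk above is mostly indentation: both dispatch arms move inside a single `unsafe` block instead of each arm carrying its own unsafe call, so one safety argument covers the whole match. A toy sketch of the shape (hypothetical enum and kernel, not `CudaBootstrappingKey`):

```rust
// Toy dispatch: one `unsafe` block around the whole match, as in the hunk.
enum Key {
    Classic(u32),
    MultiBit(u32),
}

unsafe fn backend_bitop(param: u32) -> u32 {
    param + 1 // stand-in for the raw CUDA launch
}

fn unchecked_bitop(key: &Key) -> u32 {
    // SAFETY (sketch): one block, one safety argument for both arms.
    unsafe {
        match key {
            Key::Classic(p) => backend_bitop(*p),
            Key::MultiBit(p) => backend_bitop(*p),
        }
    }
}

fn main() {
    assert_eq!(unchecked_bitop(&Key::Classic(1)), 2);
    assert_eq!(unchecked_bitop(&Key::MultiBit(2)), 3);
}
```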
@@ -385,10 +375,7 @@ impl CudaServerKey {
ct_right: &T,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, streams);
}
streams.synchronize();
self.unchecked_bitop_assign(ct_left, ct_right, BitOpType::And, streams);
}
/// Homomorphically computes the bitwise OR of two ciphertexts encrypting integer values.
@@ -451,10 +438,7 @@ impl CudaServerKey {
ct_right: &T,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, streams);
}
streams.synchronize();
self.unchecked_bitop_assign(ct_left, ct_right, BitOpType::Or, streams);
}
/// Homomorphically computes the bitwise XOR of two ciphertexts encrypting integer values.
@@ -517,10 +501,7 @@ impl CudaServerKey {
ct_right: &T,
streams: &CudaStreams,
) {
unsafe {
self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, streams);
}
streams.synchronize();
self.unchecked_bitop_assign(ct_left, ct_right, BitOpType::Xor, streams);
}
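After these three hunks, the and/or/xor assign wrappers are thin forwards to the shared `unchecked_bitop_assign` with a `BitOpType` selector, with no `unsafe` block or explicit synchronize left at this layer. A plaintext model of that wrapper shape (toy types, not the GPU API):

```rust
#[derive(Clone, Copy)]
enum BitOp {
    And,
    Or,
    Xor,
}

// Shared dispatch, mirroring unchecked_bitop_assign.
fn bitop_assign(lhs: &mut u64, rhs: u64, op: BitOp) {
    match op {
        BitOp::And => *lhs &= rhs,
        BitOp::Or => *lhs |= rhs,
        BitOp::Xor => *lhs ^= rhs,
    }
}

// Thin per-op wrapper, mirroring the unchecked_bit*_assign family.
fn bitand_assign(lhs: &mut u64, rhs: u64) {
    bitop_assign(lhs, rhs, BitOp::And);
}

fn main() {
    let mut x: u64 = 0b1010;
    bitand_assign(&mut x, 0b0110);
    assert_eq!(x, 0b0010);
    bitop_assign(&mut x, 0b0001, BitOp::Or);
    assert_eq!(x, 0b0011);
    bitop_assign(&mut x, 0b0011, BitOp::Xor);
    assert_eq!(x, 0);
}
```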
/// Homomorphically computes the bitwise AND of two ciphertexts encrypting integer values.
@@ -577,11 +558,7 @@ impl CudaServerKey {
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn bitand_assign_async<T: CudaIntegerRadixCiphertext>(
pub fn bitand_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
@@ -611,19 +588,7 @@ impl CudaServerKey {
(ct_left, &tmp_rhs)
}
};
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, streams);
}
pub fn bitand_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
streams: &CudaStreams,
) {
unsafe {
self.bitand_assign_async(ct_left, ct_right, streams);
}
streams.synchronize();
self.unchecked_bitop_assign(lhs, rhs, BitOpType::And, streams);
}
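The surviving `bitand_assign` keeps the operand normalization visible at the top of the hunk: carries are propagated in place on the left operand and into a temporary duplicate (`tmp_rhs`) for the right one, and the resulting `(lhs, rhs)` pair feeds the shared unchecked op. A plaintext model of that flow (toy ciphertext struct, simplified to the common cases):

```rust
#[derive(Clone)]
struct Ct {
    carries_empty: bool,
    value: u64,
}

fn full_propagate_assign(ct: &mut Ct) {
    ct.carries_empty = true; // stand-in for real carry propagation
}

fn bitand_assign(ct_left: &mut Ct, ct_right: &Ct) {
    if !ct_left.carries_empty {
        full_propagate_assign(ct_left);
    }
    let mut tmp_rhs;
    let rhs = if ct_right.carries_empty {
        ct_right
    } else {
        tmp_rhs = ct_right.clone(); // mirrors the tmp_rhs duplicate
        full_propagate_assign(&mut tmp_rhs);
        &tmp_rhs
    };
    ct_left.value &= rhs.value; // stand-in for unchecked_bitop_assign(.., And, ..)
}

fn main() {
    let mut a = Ct { carries_empty: false, value: 0b1100 };
    let b = Ct { carries_empty: true, value: 0b1010 };
    bitand_assign(&mut a, &b);
    assert_eq!(a.value, 0b1000);
    assert!(a.carries_empty);
}
```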
/// Homomorphically computes the bitwise OR of two ciphertexts encrypting integer values.
@@ -680,11 +645,7 @@ impl CudaServerKey {
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn bitor_assign_async<T: CudaIntegerRadixCiphertext>(
pub fn bitor_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
@@ -715,19 +676,7 @@ impl CudaServerKey {
}
};
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, streams);
}
pub fn bitor_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
streams: &CudaStreams,
) {
unsafe {
self.bitor_assign_async(ct_left, ct_right, streams);
}
streams.synchronize();
self.unchecked_bitop_assign(lhs, rhs, BitOpType::Or, streams);
}
/// Homomorphically computes the bitwise XOR of two ciphertexts encrypting integer values.
@@ -784,11 +733,7 @@ impl CudaServerKey {
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn bitxor_assign_async<T: CudaIntegerRadixCiphertext>(
pub fn bitxor_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
@@ -819,19 +764,7 @@ impl CudaServerKey {
}
};
self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, streams);
}
pub fn bitxor_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
ct_right: &T,
streams: &CudaStreams,
) {
unsafe {
self.bitxor_assign_async(ct_left, ct_right, streams);
}
streams.synchronize();
self.unchecked_bitop_assign(lhs, rhs, BitOpType::Xor, streams);
}
/// Homomorphically computes the bitwise NOT of an encrypted integer value.
@@ -880,28 +813,14 @@ impl CudaServerKey {
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
streams: &CudaStreams,
) {
pub fn bitnot_assign<T: CudaIntegerRadixCiphertext>(&self, ct: &mut T, streams: &CudaStreams) {
if !ct.block_carries_are_empty() {
self.full_propagate_assign(ct, streams);
}
self.unchecked_bitnot_assign_async(ct, streams);
self.unchecked_bitnot_assign(ct, streams);
}
pub fn bitnot_assign<T: CudaIntegerRadixCiphertext>(&self, ct: &mut T, streams: &CudaStreams) {
unsafe {
self.bitnot_assign_async(ct, streams);
}
streams.synchronize();
}
pub fn get_bitand_size_on_gpu<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &T,
@@ -910,6 +829,7 @@ impl CudaServerKey {
) -> u64 {
self.get_bitop_size_on_gpu(ct_left, ct_right, BitOpType::And, streams)
}
pub fn get_bitor_size_on_gpu<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &T,
@@ -918,6 +838,7 @@ impl CudaServerKey {
) -> u64 {
self.get_bitop_size_on_gpu(ct_left, ct_right, BitOpType::Or, streams)
}
pub fn get_bitxor_size_on_gpu<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &T,


@@ -9,11 +9,7 @@ use crate::integer::gpu::{
};
impl CudaServerKey {
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronized
pub unsafe fn unchecked_if_then_else_async<T: CudaIntegerRadixCiphertext>(
pub fn unchecked_if_then_else<T: CudaIntegerRadixCiphertext>(
&self,
condition: &CudaBooleanBlock,
true_ct: &T,
@@ -89,19 +85,6 @@ impl CudaServerKey {
result
}
pub fn unchecked_if_then_else<T: CudaIntegerRadixCiphertext>(
&self,
condition: &CudaBooleanBlock,
true_ct: &T,
false_ct: &T,
stream: &CudaStreams,
) -> T {
let result =
unsafe { self.unchecked_if_then_else_async(condition, true_ct, false_ct, stream) };
stream.synchronize();
result
}
pub fn if_then_else<T: CudaIntegerRadixCiphertext>(
&self,
condition: &CudaBooleanBlock,
@@ -130,6 +113,7 @@ impl CudaServerKey {
self.unchecked_if_then_else(condition, true_ct, false_ct, stream)
}
pub fn get_if_then_else_size_on_gpu<T: CudaIntegerRadixCiphertext>(
&self,
_condition: &CudaBooleanBlock,

File diff suppressed because it is too large.


@@ -51,33 +51,20 @@ impl CudaServerKey {
&self,
ctxt: &T,
streams: &CudaStreams,
) -> T {
let result = unsafe { self.unchecked_neg_async(ctxt, streams) };
streams.synchronize();
result
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn unchecked_neg_async<T: CudaIntegerRadixCiphertext>(
&self,
ctxt: &T,
streams: &CudaStreams,
) -> T {
let mut ciphertext_out = ctxt.duplicate(streams);
let info = ctxt.as_ref().info.blocks.first().unwrap();
cuda_backend_unchecked_negate(
streams,
ciphertext_out.as_mut(),
ctxt.as_ref(),
info.message_modulus.0 as u32,
info.carry_modulus.0 as u32,
);
unsafe {
cuda_backend_unchecked_negate(
streams,
ciphertext_out.as_mut(),
ctxt.as_ref(),
info.message_modulus.0 as u32,
info.carry_modulus.0 as u32,
);
}
ciphertext_out
}
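`unchecked_neg` duplicates the input and negates it in place on the stream; the value becomes `-x` modulo the full integer modulus, and the carry renormalization happens later in `neg` via `propagate_single_carry_assign` (see the next hunk). A plaintext check of the modular identity:

```rust
// Plaintext model: negation modulo the full integer modulus.
fn main() {
    let modulus: u64 = 1 << 8;
    for x in [0u64, 1, 5, 255] {
        let neg = (modulus - x % modulus) % modulus;
        assert_eq!((x + neg) % modulus, 0);
    }
}
```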
@@ -121,28 +108,6 @@ impl CudaServerKey {
/// assert_eq!(modulus - msg, dec);
/// ```
pub fn neg<T: CudaIntegerRadixCiphertext>(&self, ctxt: &T, streams: &CudaStreams) -> T {
let result = unsafe { self.neg_async(ctxt, streams) };
streams.synchronize();
result
}
pub fn get_neg_size_on_gpu<T: CudaIntegerRadixCiphertext>(
&self,
ctxt: &T,
streams: &CudaStreams,
) -> u64 {
self.get_scalar_add_size_on_gpu(ctxt, streams)
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn neg_async<T: CudaIntegerRadixCiphertext>(
&self,
ctxt: &T,
streams: &CudaStreams,
) -> T {
let mut tmp_ctxt;
let ct = if ctxt.block_carries_are_empty() {
@@ -153,20 +118,20 @@ impl CudaServerKey {
&mut tmp_ctxt
};
let mut res = self.unchecked_neg_async(ct, streams);
let mut res = self.unchecked_neg(ct, streams);
let _carry = self.propagate_single_carry_assign(&mut res, streams, None, OutputFlag::None);
res
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub unsafe fn overflowing_neg_async<T>(
pub fn get_neg_size_on_gpu<T: CudaIntegerRadixCiphertext>(
&self,
ctxt: &T,
streams: &CudaStreams,
) -> (T, CudaBooleanBlock)
) -> u64 {
self.get_scalar_add_size_on_gpu(ctxt, streams)
}
pub fn overflowing_neg<T>(&self, ctxt: &T, streams: &CudaStreams) -> (T, CudaBooleanBlock)
where
T: CudaIntegerRadixCiphertext,
{
@@ -178,7 +143,7 @@ impl CudaServerKey {
ct
};
self.bitnot_assign_async(&mut ct, streams);
self.bitnot_assign(&mut ct, streams);
if T::IS_SIGNED {
let tmp = CudaSignedRadixCiphertext {
@@ -192,18 +157,9 @@ impl CudaServerKey {
ciphertext: ct.into_inner(),
};
let mut overflowed = self.unsigned_overflowing_scalar_add_assign(&mut tmp, 1, streams);
self.unchecked_boolean_bitnot_assign_async(&mut overflowed, streams);
self.unchecked_boolean_bitnot_assign(&mut overflowed, streams);
let result = T::from(tmp.into_inner());
(result, overflowed)
}
}
pub fn overflowing_neg<T>(&self, ctxt: &T, streams: &CudaStreams) -> (T, CudaBooleanBlock)
where
T: CudaIntegerRadixCiphertext,
{
let result = unsafe { self.overflowing_neg_async(ctxt, streams) };
streams.synchronize();
result
}
}
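The unsigned branch above computes negation as two's complement: bitwise NOT, then a scalar add of 1, then a flip of the add's overflow flag. The flip makes sense because the carry-out of `!x + 1` fires exactly when `x == 0`, while negation "overflows" (in the `overflowing_neg` sense) for every nonzero `x`. A plaintext check of that reasoning:

```rust
// Plaintext check of the identity behind overflowing_neg above:
// -x == !x + 1, and the carry-out of `!x + 1` fires exactly when x == 0,
// so flipping that flag reports overflow for every nonzero x, matching
// u8::overflowing_neg.
fn main() {
    for x in [0u8, 1, 5, 255] {
        let (sum, carry_out) = (!x).overflowing_add(1);
        assert_eq!(sum, x.wrapping_neg());
        assert_eq!(!carry_out, x.overflowing_neg().1);
    }
}
```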


@@ -297,7 +297,7 @@ impl CudaServerKey {
)
} else {
let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams);
self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams)
self.unchecked_comparison(ct, &scalar_as_trivial, op, streams)
}
} else {
// Unsigned
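The fallback shown in this last hunk reuses the ciphertext-vs-ciphertext comparison by lifting the scalar into a trivial (noiseless) radix ciphertext, now through the safe `unchecked_comparison`. A plaintext model of that fallback (toy types, not the GPU API):

```rust
// Toy model: lift the scalar, then reuse the ciphertext comparison path.
struct TrivialCt(u64);

fn create_trivial_radix(scalar: u64) -> TrivialCt {
    TrivialCt(scalar)
}

fn unchecked_gt(lhs: &TrivialCt, rhs: &TrivialCt) -> bool {
    lhs.0 > rhs.0 // stand-in for the homomorphic comparison
}

fn main() {
    let ct = TrivialCt(9);
    assert!(unchecked_gt(&ct, &create_trivial_radix(5)));
}
```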