refactor(core): start refactoring pbs code

2026-01-11 07:38:08 -05:00 · 2023-10-13 10:29:25 +02:00
32 changed files with 2173 additions and 2235 deletions
--- a/tfhe/src/boolean/engine/bootstrapping.rs
+++ b/tfhe/src/boolean/engine/bootstrapping.rs
@@ -3,10 +3,10 @@ use crate::boolean::{ClientKey, PLAINTEXT_TRUE};
 use crate::core_crypto::algorithms::*;
 use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
 use crate::core_crypto::commons::generators::{DeterministicSeeder, EncryptionRandomGenerator};
+use crate::core_crypto::commons::math::fft64::Fft;
 use crate::core_crypto::commons::math::random::{ActivatedRandomGenerator, Seeder};
 use crate::core_crypto::commons::parameters::{CiphertextModulus, PBSOrder};
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::math::fft::Fft;
 use serde::{Deserialize, Serialize};
 use std::error::Error;

--- a/tfhe/src/core_crypto/algorithms/ggsw_conversion.rs
+++ b/tfhe/src/core_crypto/algorithms/ggsw_conversion.rs
@@ -3,14 +3,14 @@
 //! like the Fourier domain.

 use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
+use crate::core_crypto::commons::math::fft64::{Fft, FftView};
 use crate::core_crypto::commons::traits::*;
+use crate::core_crypto::commons::utils::izip;
+use crate::core_crypto::entities::fourier_ggsw_ciphertext::FourierGgswCiphertextMutView;
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::crypto::ggsw::{
-    fill_with_forward_fourier_scratch, FourierGgswCiphertext,
-};
-use crate::core_crypto::fft_impl::fft64::math::fft::{Fft, FftView};
+use crate::core_crypto::prelude::fourier_polynomial::FourierPolynomialMutView;
 use concrete_fft::c64;
-use dyn_stack::{PodStack, SizeOverflow, StackReq};
+use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};

 /// Convert a [`GGSW ciphertext`](`GgswCiphertext`) with standard coefficients to the Fourier
 /// domain.
@@ -57,14 +57,33 @@ pub fn convert_standard_ggsw_ciphertext_to_fourier_mem_optimized<Scalar, InputCo
    InputCont: Container<Element = Scalar>,
    OutputCont: ContainerMut<Element = c64>,
 {
-    output_ggsw
-        .as_mut_view()
-        .fill_with_forward_fourier(input_ggsw.as_view(), fft, stack);
+    fn implementation<Scalar: UnsignedTorus>(
+        coef_ggsw: GgswCiphertextView<'_, Scalar>,
+        fourier_ggsw: FourierGgswCiphertextMutView<'_>,
+        fft: FftView<'_>,
+        mut stack: PodStack<'_>,
+    ) {
+        debug_assert_eq!(coef_ggsw.polynomial_size(), fourier_ggsw.polynomial_size());
+        let fourier_poly_size = coef_ggsw.polynomial_size().to_fourier_polynomial_size().0;
+
+        for (fourier_poly, coef_poly) in izip!(
+            fourier_ggsw.data().into_chunks(fourier_poly_size),
+            coef_ggsw.as_polynomial_list().iter()
+        ) {
+            fft.forward_as_torus(
+                FourierPolynomialMutView::from_container(fourier_poly),
+                coef_poly,
+                stack.rb_mut(),
+            );
+        }
+    }
+
+    implementation(input_ggsw.as_view(), output_ggsw.as_mut_view(), fft, stack);
 }

 /// Return the required memory for [`convert_standard_ggsw_ciphertext_to_fourier_mem_optimized`].
 pub fn convert_standard_ggsw_ciphertext_to_fourier_mem_optimized_requirement(
    fft: FftView<'_>,
 ) -> Result<StackReq, SizeOverflow> {
-    fill_with_forward_fourier_scratch(fft)
+    fft.forward_scratch()
 }
--- a/tfhe/src/core_crypto/algorithms/lwe_bootstrap_key_conversion.rs
+++ b/tfhe/src/core_crypto/algorithms/lwe_bootstrap_key_conversion.rs
@@ -3,14 +3,12 @@
 //! like the Fourier domain.

 use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
+use crate::core_crypto::commons::math::fft64::{Fft, FftView};
 use crate::core_crypto::commons::traits::*;
+use crate::core_crypto::entities::fourier_lwe_bootstrap_key::FourierLweBootstrapKey;
 use crate::core_crypto::entities::*;
 use crate::core_crypto::fft_impl::fft128::crypto::bootstrap::Fourier128LweBootstrapKey;
 use crate::core_crypto::fft_impl::fft128::math::fft::Fft128;
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::{
-    fill_with_forward_fourier_scratch, FourierLweBootstrapKey,
-};
-use crate::core_crypto::fft_impl::fft64::math::fft::{Fft, FftView};
 use concrete_fft::c64;
 use dyn_stack::{PodStack, SizeOverflow, StackReq};

@@ -154,7 +152,7 @@ pub fn par_convert_standard_lwe_bootstrap_key_to_fourier<Scalar, InputCont, Outp
 pub fn convert_standard_lwe_bootstrap_key_to_fourier_mem_optimized_requirement(
    fft: FftView<'_>,
 ) -> Result<StackReq, SizeOverflow> {
-    fill_with_forward_fourier_scratch(fft)
+    fft.forward_scratch()
 }

 /// Convert an [`LWE bootstrap key`](`LweBootstrapKey`) with standard coefficients to the Fourier
--- a/tfhe/src/core_crypto/algorithms/lwe_multi_bit_bootstrap_key_conversion.rs
+++ b/tfhe/src/core_crypto/algorithms/lwe_multi_bit_bootstrap_key_conversion.rs
@@ -3,11 +3,11 @@
 //! representations/numerical domains like the Fourier domain.

 use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
-use crate::core_crypto::commons::traits::*;
-use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::math::fft::{
+use crate::core_crypto::commons::math::fft64::{
    par_convert_polynomials_list_to_fourier, Fft, FftView,
 };
+use crate::core_crypto::commons::traits::*;
+use crate::core_crypto::entities::*;
 use concrete_fft::c64;
 use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};

@@ -56,7 +56,7 @@ pub fn convert_standard_lwe_multi_bit_bootstrap_key_to_fourier_mem_optimized<
    InputCont: Container<Element = Scalar>,
    OutputCont: ContainerMut<Element = c64>,
 {
-    let mut output_bsk_as_polynomial_list = output_bsk.as_mut_polynomial_list();
+    let output_bsk_as_polynomial_list = output_bsk.as_mut_polynomial_list();
    let input_bsk_as_polynomial_list = input_bsk.as_polynomial_list();

    assert_eq!(
@@ -65,7 +65,7 @@ pub fn convert_standard_lwe_multi_bit_bootstrap_key_to_fourier_mem_optimized<
    );

    for (fourier_poly, coef_poly) in output_bsk_as_polynomial_list
-        .iter_mut()
+        .into_polynomial_iter()
        .zip(input_bsk_as_polynomial_list.iter())
    {
        // SAFETY: forward_as_torus doesn't write any uninitialized values into its output
--- a/tfhe/src/core_crypto/algorithms/lwe_multi_bit_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/algorithms/lwe_multi_bit_programmable_bootstrapping.rs
@@ -3,14 +3,16 @@ use crate::core_crypto::algorithms::polynomial_algorithms::*;
 use crate::core_crypto::algorithms::slice_algorithms::*;
 use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
 use crate::core_crypto::commons::math::decomposition::SignedDecomposer;
+use crate::core_crypto::commons::math::fft64::{update_with_fmadd_factor, Fft, FftView};
 use crate::core_crypto::commons::parameters::*;
 use crate::core_crypto::commons::traits::*;
 use crate::core_crypto::entities::*;
 use crate::core_crypto::fft_impl::common::pbs_modulus_switch;
-use crate::core_crypto::fft_impl::fft64::crypto::ggsw::{
-    add_external_product_assign, add_external_product_assign_scratch, update_with_fmadd_factor,
+use crate::core_crypto::prelude::{
+    add_external_product_assign_mem_optimized,
+    add_external_product_assign_mem_optimized_requirement,
+    convert_standard_ggsw_ciphertext_to_fourier_mem_optimized,
 };
-use crate::core_crypto::fft_impl::fft64::math::fft::{Fft, FftView};
 use concrete_fft::c64;
 use std::sync::{mpsc, Condvar, Mutex};
 use std::thread;
@@ -75,7 +77,7 @@ pub fn prepare_multi_bit_ggsw_mem_optimized<
        update_with_fmadd_factor(
            multi_bit_fourier_ggsw,
            fourier_ggsw.as_view().data(),
-            fourier_a_monomial.as_view().data,
+            fourier_a_monomial.as_ref(),
            factor,
            false,
            polynomial_size.to_fourier_polynomial_size().0,
@@ -475,7 +477,7 @@ pub fn multi_bit_blind_rotate_assign<Scalar, InputCont, OutputCont, KeyCont>(
        let mut buffers = ComputationBuffers::new();

        buffers.resize(
-            add_external_product_assign_scratch::<Scalar>(
+            add_external_product_assign_mem_optimized_requirement::<Scalar>(
                multi_bit_bsk.glwe_size(),
                multi_bit_bsk.polynomial_size(),
                fft,
@@ -504,10 +506,10 @@ pub fn multi_bit_blind_rotate_assign<Scalar, InputCont, OutputCont, KeyCont>(
            assert!(*ready);

            let multi_bit_fourier_ggsw = multi_bit_fourier_ggsw.lock().unwrap();
-            add_external_product_assign(
-                dst_ct,
-                multi_bit_fourier_ggsw.as_view(),
-                src_ct,
+            add_external_product_assign_mem_optimized(
+                &mut dst_ct,
+                &multi_bit_fourier_ggsw,
+                &src_ct,
                fft,
                buffers.stack(),
            );
@@ -714,7 +716,7 @@ pub fn multi_bit_deterministic_blind_rotate_assign<Scalar, InputCont, OutputCont
                    update_with_fmadd_factor(
                        multi_bit_fourier_ggsw,
                        fourier_ggsw.as_view().data(),
-                        fourier_a_monomial.as_view().data,
+                        fourier_a_monomial.as_ref(),
                        factor,
                        false,
                        lut_poly_size.to_fourier_polynomial_size().0,
@@ -751,7 +753,7 @@ pub fn multi_bit_deterministic_blind_rotate_assign<Scalar, InputCont, OutputCont
        let fft = fft.as_view();

        buffers.resize(
-            add_external_product_assign_scratch::<Scalar>(
+            add_external_product_assign_mem_optimized_requirement::<Scalar>(
                multi_bit_bsk.glwe_size(),
                multi_bit_bsk.polynomial_size(),
                fft,
@@ -785,10 +787,10 @@ pub fn multi_bit_deterministic_blind_rotate_assign<Scalar, InputCont, OutputCont

            let multi_bit_fourier_ggsw = multi_bit_fourier_ggsw.lock().unwrap();

-            add_external_product_assign(
-                dst_ct,
-                multi_bit_fourier_ggsw.as_view(),
-                src_ct,
+            add_external_product_assign_mem_optimized(
+                &mut dst_ct,
+                &multi_bit_fourier_ggsw,
+                &src_ct,
                fft,
                buffers.stack(),
            );
@@ -1456,8 +1458,9 @@ pub fn std_multi_bit_blind_rotate_assign<Scalar, InputCont, OutputCont, KeyCont>
                    lwe_mask_elements,
                );

-                fourier_ggsw_buffer.as_mut_view().fill_with_forward_fourier(
-                    std_ggsw_buffer.as_view(),
+                convert_standard_ggsw_ciphertext_to_fourier_mem_optimized(
+                    &std_ggsw_buffer,
+                    &mut fourier_ggsw_buffer,
                    fft,
                    buffers.stack(),
                );
@@ -1493,7 +1496,7 @@ pub fn std_multi_bit_blind_rotate_assign<Scalar, InputCont, OutputCont, KeyCont>
        let mut buffers = ComputationBuffers::new();

        buffers.resize(
-            add_external_product_assign_scratch::<Scalar>(
+            add_external_product_assign_mem_optimized_requirement::<Scalar>(
                multi_bit_bsk.glwe_size(),
                multi_bit_bsk.polynomial_size(),
                fft,
@@ -1522,10 +1525,10 @@ pub fn std_multi_bit_blind_rotate_assign<Scalar, InputCont, OutputCont, KeyCont>
            assert!(*ready);

            let multi_bit_fourier_ggsw = multi_bit_fourier_ggsw.lock().unwrap();
-            add_external_product_assign(
-                dst_ct,
-                multi_bit_fourier_ggsw.as_view(),
-                src_ct,
+            add_external_product_assign_mem_optimized(
+                &mut dst_ct,
+                &multi_bit_fourier_ggsw,
+                &src_ct,
                fft,
                buffers.stack(),
            );
@@ -1728,8 +1731,9 @@ pub fn std_multi_bit_deterministic_blind_rotate_assign<Scalar, InputCont, Output
                    lwe_mask_elements,
                );

-                fourier_ggsw_buffer.as_mut_view().fill_with_forward_fourier(
-                    std_ggsw_buffer.as_view(),
+                convert_standard_ggsw_ciphertext_to_fourier_mem_optimized(
+                    &std_ggsw_buffer,
+                    &mut fourier_ggsw_buffer,
                    fft,
                    buffers.stack(),
                );
@@ -1761,7 +1765,7 @@ pub fn std_multi_bit_deterministic_blind_rotate_assign<Scalar, InputCont, Output
        let mut buffers = ComputationBuffers::new();

        buffers.resize(
-            add_external_product_assign_scratch::<Scalar>(
+            add_external_product_assign_mem_optimized_requirement::<Scalar>(
                multi_bit_bsk.glwe_size(),
                multi_bit_bsk.polynomial_size(),
                fft,
@@ -1795,10 +1799,10 @@ pub fn std_multi_bit_deterministic_blind_rotate_assign<Scalar, InputCont, Output

            let multi_bit_fourier_ggsw = multi_bit_fourier_ggsw.lock().unwrap();

-            add_external_product_assign(
-                dst_ct,
-                multi_bit_fourier_ggsw.as_view(),
-                src_ct,
+            add_external_product_assign_mem_optimized(
+                &mut dst_ct,
+                &multi_bit_fourier_ggsw,
+                &src_ct,
                fft,
                buffers.stack(),
            );
--- a/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping.rs
@@ -2,26 +2,26 @@
 //! bootstrap`](`LweBootstrapKey#programmable-bootstrapping`).

 use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
-use crate::core_crypto::commons::math::decomposition::SignedDecomposer;
+use crate::core_crypto::commons::math::decomposition::{
+    SignedDecomposer, TensorSignedDecompositionLendingIter,
+};
+use crate::core_crypto::commons::math::fft64::{update_with_fmadd, Fft, FftView};
 use crate::core_crypto::commons::parameters::*;
 use crate::core_crypto::commons::traits::*;
+use crate::core_crypto::commons::utils::izip;
+use crate::core_crypto::entities::fourier_lwe_bootstrap_key::FourierLweBootstrapKey;
+use crate::core_crypto::entities::fourier_polynomial::FourierPolynomialMutView;
 use crate::core_crypto::entities::*;
+use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
 use crate::core_crypto::fft_impl::fft128::crypto::bootstrap::{
    bootstrap_scratch as bootstrap_scratch_f128, Fourier128LweBootstrapKey,
 };
 use crate::core_crypto::fft_impl::fft128::math::fft::{Fft128, Fft128View};
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::{
-    bootstrap_scratch, FourierLweBootstrapKey,
-};
-use crate::core_crypto::fft_impl::fft64::crypto::ggsw::{
-    add_external_product_assign as impl_add_external_product_assign,
-    add_external_product_assign_scratch as impl_add_external_product_assign_scratch, cmux,
-    cmux_scratch, FourierGgswCiphertext,
-};
-use crate::core_crypto::fft_impl::fft64::crypto::wop_pbs::blind_rotate_assign_scratch;
-use crate::core_crypto::fft_impl::fft64::math::fft::{Fft, FftView};
+use aligned_vec::CACHELINE_ALIGN;
 use concrete_fft::c64;
-use dyn_stack::{PodStack, SizeOverflow, StackReq};
+use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};
+
+use super::ggsw_conversion::convert_standard_ggsw_ciphertext_to_fourier_mem_optimized_requirement;

 /// Perform a blind rotation given an input [`LWE ciphertext`](`LweCiphertext`), modifying a look-up
 /// table passed as a [`GLWE ciphertext`](`GlweCiphertext`) and an [`LWE bootstrap
@@ -293,7 +293,20 @@ pub fn blind_rotate_assign_mem_optimized_requirement<Scalar>(
    polynomial_size: PolynomialSize,
    fft: FftView<'_>,
 ) -> Result<StackReq, SizeOverflow> {
-    blind_rotate_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)
+    StackReq::try_any_of([
+        // tmp_poly allocation
+        StackReq::try_new_aligned::<Scalar>(polynomial_size.0, CACHELINE_ALIGN)?,
+        StackReq::try_all_of([
+            // ct1 allocation
+            StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, CACHELINE_ALIGN)?,
+            // external product
+            add_external_product_assign_mem_optimized_requirement::<Scalar>(
+                glwe_size,
+                polynomial_size,
+                fft,
+            )?,
+        ])?,
+    ])
 }

 /// Compute the external product of `ggsw` and `glwe`, and add the result to `out`.
@@ -484,31 +497,160 @@ pub fn add_external_product_assign_mem_optimized<Scalar, OutputGlweCont, InputGl
    GgswCont: Container<Element = c64>,
    InputGlweCont: Container<Element = Scalar>,
 {
-    assert_eq!(out.ciphertext_modulus(), glwe.ciphertext_modulus());
-    let ciphertext_modulus = out.ciphertext_modulus();
-    assert!(ciphertext_modulus.is_compatible_with_native_modulus());
+    fn implementation<Scalar: UnsignedTorus>(
+        mut out: GlweCiphertextMutView<'_, Scalar>,
+        ggsw: FourierGgswCiphertext<&[c64]>,
+        glwe: GlweCiphertext<&[Scalar]>,
+        fft: FftView<'_>,
+        stack: PodStack<'_>,
+    ) {
+        assert_eq!(out.ciphertext_modulus(), glwe.ciphertext_modulus());
+        let ciphertext_modulus = out.ciphertext_modulus();
+        assert!(ciphertext_modulus.is_compatible_with_native_modulus());

-    impl_add_external_product_assign(
+        // we check that the polynomial sizes match
+        assert_eq!(ggsw.polynomial_size(), glwe.polynomial_size());
+        assert_eq!(ggsw.polynomial_size(), out.polynomial_size());
+        // we check that the glwe sizes match
+        assert_eq!(ggsw.glwe_size(), glwe.glwe_size());
+        assert_eq!(ggsw.glwe_size(), out.glwe_size());
+
+        let align = CACHELINE_ALIGN;
+        let fourier_poly_size = ggsw.polynomial_size().to_fourier_polynomial_size().0;
+
+        // we round the input mask and body
+        let decomposer = SignedDecomposer::<Scalar>::new(
+            ggsw.decomposition_base_log(),
+            ggsw.decomposition_level_count(),
+        );
+
+        let (mut output_fft_buffer, mut substack0) =
+            stack.make_aligned_raw::<c64>(fourier_poly_size * ggsw.glwe_size().0, align);
+        // output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
+        // the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
+        // it has been fully initialized for the first time.
+        let output_fft_buffer = &mut *output_fft_buffer;
+        let mut is_output_uninit = true;
+
+        {
+            // --------------------------------------------- EXTERNAL PRODUCT IN FOURIER DOMAIN
+            // In this section, we perform the external product in the fourier domain, and
+            // accumulate the result in the output_fft_buffer variable.
+            let glwe_len = glwe.as_ref().len();
+            let (mut states, mut substack1) = substack0
+                .rb_mut()
+                .make_aligned_raw::<Scalar>(glwe_len, CACHELINE_ALIGN);
+            let mut decomposition = TensorSignedDecompositionLendingIter::new(
+                glwe.as_ref()
+                    .iter()
+                    .map(|s| decomposer.closest_representable(*s)),
+                DecompositionBaseLog(decomposer.base_log),
+                DecompositionLevelCount(decomposer.level_count),
+                &mut states,
+            );
+
+            // We loop through the levels (we reverse to match the order of the decomposition
+            // iterator.)
+            ggsw.into_levels().rev().for_each(|ggsw_decomp_matrix| {
+                // We retrieve the decomposition of this level.
+                let (mut glwe_decomp_term, mut substack2) = substack1
+                    .rb_mut()
+                    .make_aligned_raw::<Scalar>(glwe_len, CACHELINE_ALIGN);
+                let glwe_level = decomposition
+                    .fill_next_term(&mut glwe_decomp_term)
+                    .unwrap()
+                    .0;
+                let glwe_decomp_term = GlweCiphertextView::from_container(
+                    &*glwe_decomp_term,
+                    ggsw.polynomial_size(),
+                    out.ciphertext_modulus(),
+                );
+                debug_assert_eq!(ggsw_decomp_matrix.decomposition_level(), glwe_level);
+
+                // For each level we have to add the result of the vector-matrix product between the
+                // decomposition of the glwe, and the ggsw level matrix to the output. To do so, we
+                // iteratively add to the output, the product between every line of the matrix, and
+                // the corresponding (scalar) polynomial in the glwe decomposition:
+                //
+                //                ggsw_mat                        ggsw_mat
+                //   glwe_dec   | - - - - | <        glwe_dec   | - - - - |
+                //  | - - - | x | - - - - |         | - - - | x | - - - - | <
+                //    ^         | - - - - |             ^       | - - - - |
+                //
+                //        t = 1                           t = 2                     ...
+
+                izip!(
+                    ggsw_decomp_matrix.into_rows(),
+                    glwe_decomp_term.as_polynomial_list().iter()
+                )
+                .for_each(|(ggsw_row, glwe_poly)| {
+                    let (mut fourier, substack3) = substack2
+                        .rb_mut()
+                        .make_aligned_raw::<c64>(fourier_poly_size, align);
+                    // We perform the forward fft transform for the glwe polynomial
+                    fft.forward_as_integer(
+                        FourierPolynomialMutView::from_container(&mut fourier),
+                        glwe_poly,
+                        substack3,
+                    );
+                    // Now we loop through the polynomials of the output, and add the
+                    // corresponding product of polynomials.
+
+                    update_with_fmadd(
+                        output_fft_buffer,
+                        ggsw_row.data(),
+                        &fourier,
+                        is_output_uninit,
+                        fourier_poly_size,
+                    );
+
+                    // we initialized `output_fft_buffer`, so we can set this to false
+                    is_output_uninit = false;
+                });
+            });
+        }
+
+        // --------------------------------------------  TRANSFORMATION OF RESULT TO STANDARD DOMAIN
+        // In this section, we bring the result from the fourier domain, back to the standard
+        // domain, and add it to the output.
+        //
+        // We iterate over the polynomials in the output.
+        if !is_output_uninit {
+            izip!(
+                out.as_mut_polynomial_list().iter_mut(),
+                output_fft_buffer
+                    .into_chunks(fourier_poly_size)
+                    .map(FourierPolynomialMutView::from_container),
+            )
+            .for_each(|(out, fourier)| {
+                // The fourier buffer is not re-used afterwards so we can use the in-place version
+                // of the add_backward_as_torus function
+                fft.add_backward_in_place_as_torus(out, fourier, substack0.rb_mut());
+            });
+        }
+
+        if !ciphertext_modulus.is_native_modulus() {
+            // When we convert back from the fourier domain, integer values will contain up to 53
+            // MSBs with information. In our representation of power of 2 moduli < native modulus we
+            // fill the MSBs and leave the LSBs empty; using the signed decomposer here lets us
+            // round while keeping the data in the MSBs
+            let signed_decomposer = SignedDecomposer::new(
+                DecompositionBaseLog(ciphertext_modulus.get_custom_modulus().ilog2() as usize),
+                DecompositionLevelCount(1),
+            );
+            out.as_mut()
+                .iter_mut()
+                .for_each(|x| *x = signed_decomposer.closest_representable(*x));
+        }
+    }
+
+    implementation(
        out.as_mut_view(),
        ggsw.as_view(),
        glwe.as_view(),
        fft,
        stack,
    );
-
-    if !ciphertext_modulus.is_native_modulus() {
-        // When we convert back from the fourier domain, integer values will contain up to 53
-        // MSBs with information. In our representation of power of 2 moduli < native modulus we
-        // fill the MSBs and leave the LSBs empty, this usage of the signed decomposer allows to
-        // round while keeping the data in the MSBs
-        let signed_decomposer = SignedDecomposer::new(
-            DecompositionBaseLog(ciphertext_modulus.get_custom_modulus().ilog2() as usize),
-            DecompositionLevelCount(1),
-        );
-        out.as_mut()
-            .iter_mut()
-            .for_each(|x| *x = signed_decomposer.closest_representable(*x));
-    }
 }

 /// Return the required memory for [`add_external_product_assign_mem_optimized`].
@@ -517,7 +659,22 @@ pub fn add_external_product_assign_mem_optimized_requirement<Scalar>(
    polynomial_size: PolynomialSize,
    fft: FftView<'_>,
 ) -> Result<StackReq, SizeOverflow> {
-    impl_add_external_product_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)
+    let align = CACHELINE_ALIGN;
+    let standard_scratch =
+        StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, align)?;
+    let fourier_polynomial_size = polynomial_size.to_fourier_polynomial_size().0;
+    let fourier_scratch =
+        StackReq::try_new_aligned::<c64>(glwe_size.0 * fourier_polynomial_size, align)?;
+    let fourier_scratch_single = StackReq::try_new_aligned::<c64>(fourier_polynomial_size, align)?;
+
+    let substack3 = fft.forward_scratch()?;
+    let substack2 = substack3.try_and(fourier_scratch_single)?;
+    let substack1 = substack2.try_and(standard_scratch)?;
+    let substack0 = StackReq::try_any_of([
+        substack1.try_and(standard_scratch)?,
+        fft.backward_scratch()?,
+    ])?;
+    substack0.try_and(fourier_scratch)
 }

 /// Compute a cmux on the input `ct0` and `ct1` using `ggsw` as selector.
@@ -774,31 +931,13 @@ pub fn cmux_assign_mem_optimized<Scalar, Cont0, Cont1, GgswCont>(
    Cont1: ContainerMut<Element = Scalar>,
    GgswCont: Container<Element = c64>,
 {
-    assert_eq!(ct0.ciphertext_modulus(), ct1.ciphertext_modulus());
    let ciphertext_modulus = ct0.ciphertext_modulus();
+    assert_eq!(ct0.ciphertext_modulus(), ct1.ciphertext_modulus());
    assert!(ciphertext_modulus.is_compatible_with_native_modulus());
-
-    cmux(
-        ct0.as_mut_view(),
-        ct1.as_mut_view(),
-        ggsw.as_view(),
-        fft,
-        stack,
-    );
-
-    if !ciphertext_modulus.is_native_modulus() {
-        // When we convert back from the fourier domain, integer values will contain up to 53
-        // MSBs with information. In our representation of power of 2 moduli < native modulus we
-        // fill the MSBs and leave the LSBs empty, this usage of the signed decomposer allows to
-        // round while keeping the data in the MSBs
-        let signed_decomposer = SignedDecomposer::new(
-            DecompositionBaseLog(ciphertext_modulus.get_custom_modulus().ilog2() as usize),
-            DecompositionLevelCount(1),
-        );
-        ct0.as_mut()
-            .iter_mut()
-            .for_each(|x| *x = signed_decomposer.closest_representable(*x));
-    }
+    izip!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
+        *c1 = c1.wrapping_sub(*c0);
+    });
+    add_external_product_assign_mem_optimized(ct0, ggsw, ct1, fft, stack);
 }

 /// Return the required memory for [`cmux_assign_mem_optimized`].
@@ -807,7 +946,7 @@ pub fn cmux_assign_mem_optimized_requirement<Scalar>(
    polynomial_size: PolynomialSize,
    fft: FftView<'_>,
 ) -> Result<StackReq, SizeOverflow> {
-    cmux_scratch::<Scalar>(glwe_size, polynomial_size, fft)
+    add_external_product_assign_mem_optimized_requirement::<Scalar>(glwe_size, polynomial_size, fft)
 }

 /// Perform a programmable bootstrap given an input [`LWE ciphertext`](`LweCiphertext`), a
@@ -1116,7 +1255,11 @@ pub fn programmable_bootstrap_lwe_ciphertext_mem_optimized_requirement<Scalar>(
    polynomial_size: PolynomialSize,
    fft: FftView<'_>,
 ) -> Result<StackReq, SizeOverflow> {
-    bootstrap_scratch::<Scalar>(glwe_size, polynomial_size, fft)
+    blind_rotate_assign_mem_optimized_requirement::<Scalar>(glwe_size, polynomial_size, fft)?
+        .try_and(StackReq::try_new_aligned::<Scalar>(
+            glwe_size.0 * polynomial_size.0,
+            CACHELINE_ALIGN,
+        )?)
 }

 /// Perform a programmable bootstrap given an input [`LWE ciphertext`](`LweCiphertext`), a
@@ -1406,3 +1549,101 @@ pub fn programmable_bootstrap_f128_lwe_ciphertext_mem_optimized_requirement<Scal
 ) -> Result<StackReq, SizeOverflow> {
    bootstrap_scratch_f128::<Scalar>(glwe_size, polynomial_size, fft)
 }
+
+impl<Scalar> FourierBootstrapKey<Scalar> for FourierLweBootstrapKeyOwned
+where
+    Scalar: UnsignedTorus + CastInto<usize>,
+{
+    type Fft = Fft;
+
+    fn new_fft(polynomial_size: PolynomialSize) -> Self::Fft {
+        Fft::new(polynomial_size)
+    }
+
+    fn new(
+        input_lwe_dimension: LweDimension,
+        polynomial_size: PolynomialSize,
+        glwe_size: GlweSize,
+        decomposition_base_log: DecompositionBaseLog,
+        decomposition_level_count: DecompositionLevelCount,
+    ) -> Self {
+        Self::new(
+            input_lwe_dimension,
+            glwe_size,
+            polynomial_size,
+            decomposition_base_log,
+            decomposition_level_count,
+        )
+    }
+
+    fn fill_with_forward_fourier_scratch(fft: &Self::Fft) -> Result<StackReq, SizeOverflow> {
+        convert_standard_ggsw_ciphertext_to_fourier_mem_optimized_requirement(fft.as_view())
+    }
+
+    fn fill_with_forward_fourier<ContBsk>(
+        &mut self,
+        coef_bsk: &LweBootstrapKey<ContBsk>,
+        fft: &Self::Fft,
+        stack: PodStack<'_>,
+    ) where
+        ContBsk: Container<Element = Scalar>,
+    {
+        self.as_mut_view()
+            .fill_with_forward_fourier(coef_bsk.as_view(), fft.as_view(), stack);
+    }
+
+    fn bootstrap_scratch(
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        fft: &Self::Fft,
+    ) -> Result<StackReq, SizeOverflow> {
+        programmable_bootstrap_lwe_ciphertext_mem_optimized_requirement::<Scalar>(
+            glwe_size,
+            polynomial_size,
+            fft.as_view(),
+        )
+    }
+
+    fn bootstrap<ContLweOut, ContLweIn, ContAcc>(
+        &self,
+        lwe_out: &mut LweCiphertext<ContLweOut>,
+        lwe_in: &LweCiphertext<ContLweIn>,
+        accumulator: &GlweCiphertext<ContAcc>,
+        fft: &Self::Fft,
+        stack: PodStack<'_>,
+    ) where
+        ContLweOut: ContainerMut<Element = Scalar>,
+        ContLweIn: Container<Element = Scalar>,
+        ContAcc: Container<Element = Scalar>,
+    {
+        self.as_view().bootstrap(
+            lwe_out.as_mut_view(),
+            lwe_in.as_view(),
+            accumulator.as_view(),
+            fft.as_view(),
+            stack,
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::core_crypto::fft_impl::common::tests::test_bootstrap_generic;
+    use crate::core_crypto::prelude::*;
+
+    #[test]
+    fn test_bootstrap_u64() {
+        test_bootstrap_generic::<u64, FourierLweBootstrapKeyOwned>(
+            StandardDev(0.000007069849454709433),
+            StandardDev(0.00000000000000029403601535432533),
+        );
+    }
+
+    #[test]
+    fn test_bootstrap_u32() {
+        test_bootstrap_generic::<u32, FourierLweBootstrapKeyOwned>(
+            StandardDev(0.000007069849454709433),
+            StandardDev(0.00000000000000029403601535432533),
+        );
+    }
+}
--- a/tfhe/src/core_crypto/algorithms/lwe_wopbs.rs
+++ b/tfhe/src/core_crypto/algorithms/lwe_wopbs.rs
@@ -3,19 +3,952 @@
 use crate::core_crypto::algorithms::*;
 use crate::core_crypto::commons::dispersion::DispersionParameter;
 use crate::core_crypto::commons::generators::EncryptionRandomGenerator;
+use crate::core_crypto::commons::math::fft64::FftView;
 use crate::core_crypto::commons::parameters::*;
 use crate::core_crypto::commons::traits::*;
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::FourierLweBootstrapKey;
-use crate::core_crypto::fft_impl::fft64::crypto::wop_pbs::{
-    circuit_bootstrap_boolean_vertical_packing, circuit_bootstrap_boolean_vertical_packing_scratch,
-    extract_bits, extract_bits_scratch,
-};
-use crate::core_crypto::fft_impl::fft64::math::fft::FftView;
 use concrete_fft::c64;
 use dyn_stack::{PodStack, SizeOverflow, StackReq};
 use rayon::prelude::*;

+mod implementation {
+    #![allow(clippy::too_many_arguments)]
+
+    use crate::core_crypto::algorithms::polynomial_algorithms::*;
+    use crate::core_crypto::algorithms::*;
+    use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
+    use crate::core_crypto::commons::math::fft64::FftView;
+    use crate::core_crypto::commons::numeric::CastInto;
+    use crate::core_crypto::commons::parameters::*;
+    use crate::core_crypto::commons::traits::*;
+    use crate::core_crypto::commons::utils::izip;
+    use crate::core_crypto::entities::*;
+    use aligned_vec::CACHELINE_ALIGN;
+    use concrete_fft::c64;
+    use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};
+
+    pub fn extract_bits_scratch<Scalar>( // stack requirement for the buffers of `extract_bits`
+        input_lwe_dimension: LweDimension,
+        ksk_after_key_size: LweDimension,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> { // `Err` when a size computation overflows
+        let align = CACHELINE_ALIGN;
+
+        let lwe_in_buffer = // one LWE ciphertext of the input dimension
+            StackReq::try_new_aligned::<Scalar>(input_lwe_dimension.to_lwe_size().0, align)?;
+        let lwe_out_ks_buffer = // one LWE ciphertext of the post-keyswitch dimension
+            StackReq::try_new_aligned::<Scalar>(ksk_after_key_size.to_lwe_size().0, align)?;
+        let pbs_accumulator = // one GLWE ciphertext: glwe_size polynomials of polynomial_size
+            StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, align)?;
+        let lwe_out_pbs_buffer = StackReq::try_new_aligned::<Scalar>( // LWE extracted from the GLWE
+            glwe_size
+                .to_glwe_dimension()
+                .to_equivalent_lwe_dimension(polynomial_size)
+                .to_lwe_size()
+                .0,
+            align,
+        )?;
+        let lwe_bit_left_shift_buffer = lwe_in_buffer; // same size as the input-sized buffer
+        let bootstrap_scratch = programmable_bootstrap_lwe_ciphertext_mem_optimized_requirement::<
+            Scalar,
+        >(glwe_size, polynomial_size, fft)?;
+
+        lwe_in_buffer // the four buffers chained with `try_and` are required together
+            .try_and(lwe_out_ks_buffer)?
+            .try_and(pbs_accumulator)?
+            .try_and(lwe_out_pbs_buffer)?
+            .try_and(StackReq::try_any_of([ // alternatives: room for either one, not both
+                lwe_bit_left_shift_buffer,
+                bootstrap_scratch,
+            ])?)
+    }
+
+    /// Function to extract `number_of_bits_to_extract` from an [`LweCiphertext`] starting at the
+    /// bit number `delta_log` (0-indexed) included.
+    ///
+    /// Output bits are ordered from the MSB to the LSB. Each one of them is output in a distinct
+    /// LWE ciphertext, containing the encryption of the bit scaled by q/2 (i.e., the most
+    /// significant bit in the plaintext representation).
+    pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
+        mut lwe_list_out: LweCiphertextList<&'_ mut [Scalar]>, // one output ciphertext per bit
+        lwe_in: LweCiphertext<&'_ [Scalar]>,
+        ksk: LweKeyswitchKey<&'_ [Scalar]>,
+        fourier_bsk: FourierLweBootstrapKeyView<'_>,
+        delta_log: DeltaLog,
+        number_of_bits_to_extract: ExtractedBitsCount,
+        fft: FftView<'_>,
+        stack: PodStack<'_>, // scratch memory, see `extract_bits_scratch`
+    ) {
+        debug_assert!(lwe_list_out.ciphertext_modulus() == lwe_in.ciphertext_modulus());
+        debug_assert!(lwe_in.ciphertext_modulus() == ksk.ciphertext_modulus());
+        debug_assert!(
+            ksk.ciphertext_modulus().is_native_modulus(),
+            "This operation only supports native moduli"
+        );
+
+        let ciphertext_n_bits = Scalar::BITS;
+        let number_of_bits_to_extract = number_of_bits_to_extract.0;
+
+        debug_assert!(
+            ciphertext_n_bits >= number_of_bits_to_extract + delta_log.0,
+            "Tried to extract {} bits, while the maximum number of extractable bits for {} bits \
+        ciphertexts and a scaling factor of 2^{} is {}",
+            number_of_bits_to_extract,
+            ciphertext_n_bits,
+            delta_log.0,
+            ciphertext_n_bits - delta_log.0,
+        );
+        debug_assert!(
+            lwe_list_out.lwe_size().to_lwe_dimension() == ksk.output_key_lwe_dimension(),
+            "lwe_list_out needs to have an lwe_size of {}, got {}",
+            ksk.output_key_lwe_dimension().0,
+            lwe_list_out.lwe_size().to_lwe_dimension().0,
+        );
+        debug_assert!(
+            lwe_list_out.lwe_ciphertext_count().0 == number_of_bits_to_extract,
+            "lwe_list_out needs to have a ciphertext count of {}, got {}",
+            number_of_bits_to_extract,
+            lwe_list_out.lwe_ciphertext_count().0,
+        );
+        debug_assert!(
+            lwe_in.lwe_size() == fourier_bsk.output_lwe_dimension().to_lwe_size(),
+            "lwe_in needs to have an LWE size of {}, got {}",
+            fourier_bsk.output_lwe_dimension().to_lwe_size().0,
+            lwe_in.lwe_size().0,
+        );
+        debug_assert!(
+            ksk.output_key_lwe_dimension() == fourier_bsk.input_lwe_dimension(),
+            "ksk needs to have an output LWE dimension of {}, got {}",
+            fourier_bsk.input_lwe_dimension().0,
+            ksk.output_key_lwe_dimension().0,
+        );
+        // All shape/modulus preconditions are checked above (debug builds only); from here on
+        // the buffers are carved out of `stack` in the same order as in `extract_bits_scratch`.
+
+        let polynomial_size = fourier_bsk.polynomial_size();
+        let glwe_size = fourier_bsk.glwe_size();
+        let glwe_dimension = glwe_size.to_glwe_dimension();
+        let ciphertext_modulus = lwe_in.ciphertext_modulus();
+
+        let align = CACHELINE_ALIGN;
+
+        let (mut lwe_in_buffer_data, stack) = // working copy of the input, modified per bit
+            stack.collect_aligned(align, lwe_in.as_ref().iter().copied());
+        let mut lwe_in_buffer =
+            LweCiphertext::from_container(&mut *lwe_in_buffer_data, lwe_in.ciphertext_modulus());
+
+        let (mut lwe_out_ks_buffer_data, stack) =
+            stack.make_aligned_with(ksk.output_lwe_size().0, align, |_| Scalar::ZERO);
+        let mut lwe_out_ks_buffer =
+            LweCiphertext::from_container(&mut *lwe_out_ks_buffer_data, ksk.ciphertext_modulus());
+
+        let (mut pbs_accumulator_data, stack) =
+            stack.make_aligned_with(glwe_size.0 * polynomial_size.0, align, |_| Scalar::ZERO);
+        let mut pbs_accumulator = GlweCiphertextMutView::from_container(
+            &mut *pbs_accumulator_data,
+            polynomial_size,
+            ciphertext_modulus,
+        );
+
+        let lwe_size = glwe_dimension
+            .to_equivalent_lwe_dimension(polynomial_size)
+            .to_lwe_size();
+        let (mut lwe_out_pbs_buffer_data, mut stack) =
+            stack.make_aligned_with(lwe_size.0, align, |_| Scalar::ZERO);
+        let mut lwe_out_pbs_buffer = LweCiphertext::from_container(
+            &mut *lwe_out_pbs_buffer_data,
+            lwe_list_out.ciphertext_modulus(),
+        );
+
+        // We iterate on the list in reverse as we want to store the extracted MSB at index 0
+        for (bit_idx, mut output_ct) in lwe_list_out.iter_mut().rev().enumerate() {
+            // Shift on padding bit
+            let (lwe_bit_left_shift_buffer_data, _) = stack.rb_mut().collect_aligned(
+                align,
+                lwe_in_buffer
+                    .as_ref()
+                    .iter()
+                    .map(|s| *s << (ciphertext_n_bits - delta_log.0 - bit_idx - 1)),
+            );
+
+            // Key switch to input PBS key
+            keyswitch_lwe_ciphertext(
+                &ksk,
+                &LweCiphertext::from_container(
+                    &*lwe_bit_left_shift_buffer_data,
+                    lwe_in.ciphertext_modulus(),
+                ),
+                &mut lwe_out_ks_buffer,
+            );
+
+            drop(lwe_bit_left_shift_buffer_data); // frees the scratch area for the bootstrap
+
+            // Store the keyswitch output unmodified to the output list (as we need to do other
+            // computations on the output of the keyswitch)
+            output_ct
+                .as_mut()
+                .copy_from_slice(lwe_out_ks_buffer.as_ref());
+
+            // If this was the last extracted bit, break
+            // we subtract 1 because if the number_of_bits_to_extract is 1 we want to stop right
+            // away
+            if bit_idx == number_of_bits_to_extract - 1 {
+                break;
+            }
+
+            // Add q/4 to center the error while computing a negacyclic LUT
+            let out_ks_body = lwe_out_ks_buffer.get_mut_body().data;
+            *out_ks_body = (*out_ks_body).wrapping_add(Scalar::ONE << (ciphertext_n_bits - 2));
+
+            // Fill lut for the current bit (equivalent to trivial encryption as mask is 0s)
+            // The LUT is filled with -alpha in each coefficient where alpha = delta*2^{bit_idx-1}
+            for poly_coeff in &mut pbs_accumulator
+                .as_mut_view()
+                .get_mut_body()
+                .as_mut_polynomial()
+                .iter_mut()
+            {
+                *poly_coeff = Scalar::ZERO.wrapping_sub(Scalar::ONE << (delta_log.0 + bit_idx - 1));
+            }
+
+            fourier_bsk.bootstrap(
+                lwe_out_pbs_buffer.as_mut_view(),
+                lwe_out_ks_buffer.as_view(),
+                pbs_accumulator.as_view(),
+                fft,
+                stack.rb_mut(),
+            );
+
+            // Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 if the
+            // extracted bit was 0 and 1 in the other case
+            let out_pbs_body = lwe_out_pbs_buffer.get_mut_body().data;
+
+            *out_pbs_body =
+                (*out_pbs_body).wrapping_add(Scalar::ONE << (delta_log.0 + bit_idx - 1));
+
+            // Remove the extracted bit from the initial LWE to get a 0 at the extracted bit
+            // location.
+            izip!(lwe_in_buffer.as_mut(), lwe_out_pbs_buffer.as_ref())
+                .for_each(|(out, inp)| *out = (*out).wrapping_sub(*inp));
+        }
+    }
+
+    pub fn circuit_bootstrap_boolean_scratch<Scalar>( // stack need of `circuit_bootstrap_boolean`
+        lwe_in_size: LweSize,
+        bsk_output_lwe_size: LweSize,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> {
+        StackReq::try_new_aligned::<Scalar>(bsk_output_lwe_size.0, CACHELINE_ALIGN)?.try_and( // PBS output
+            homomorphic_shift_boolean_scratch::<Scalar>( // plus what each homomorphic shift needs
+                lwe_in_size,
+                glwe_size,
+                polynomial_size,
+                fft,
+            )?,
+        )
+    }
+
+    /// Circuit bootstrapping for boolean messages, i.e. containing only one bit of message.
+    ///
+    /// The decomposition base log and level count of the output GGSW ciphertext `ggsw_out` are
+    /// used as the circuit bootstrap decomposition parameters; one PBS is run per level.
+    pub fn circuit_bootstrap_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
+        fourier_bsk: FourierLweBootstrapKeyView<'_>,
+        lwe_in: LweCiphertext<&[Scalar]>,
+        mut ggsw_out: GgswCiphertext<&mut [Scalar]>, // filled level matrix by level matrix
+        delta_log: DeltaLog,
+        pfpksk_list: LwePrivateFunctionalPackingKeyswitchKeyList<&[Scalar]>, // one key per GLWE row
+        fft: FftView<'_>,
+        stack: PodStack<'_>, // scratch memory, see `circuit_bootstrap_boolean_scratch`
+    ) {
+        debug_assert!(lwe_in.ciphertext_modulus() == ggsw_out.ciphertext_modulus());
+        debug_assert!(ggsw_out.ciphertext_modulus() == pfpksk_list.ciphertext_modulus());
+
+        debug_assert!(
+            pfpksk_list.ciphertext_modulus().is_native_modulus(),
+            "This operation currently only supports native moduli"
+        );
+
+        let level_cbs = ggsw_out.decomposition_level_count();
+        let base_log_cbs = ggsw_out.decomposition_base_log();
+
+        debug_assert!(
+            level_cbs.0 >= 1,
+            "level_cbs needs to be >= 1, got {}",
+            level_cbs.0
+        );
+        debug_assert!(
+            base_log_cbs.0 >= 1,
+            "base_log_cbs needs to be >= 1, got {}",
+            base_log_cbs.0
+        );
+
+        let fpksk_input_lwe_key_dimension = pfpksk_list.input_key_lwe_dimension();
+        let fourier_bsk_output_lwe_dimension = fourier_bsk.output_lwe_dimension();
+
+        debug_assert!(
+            fpksk_input_lwe_key_dimension == fourier_bsk_output_lwe_dimension,
+            "The fourier_bsk output_lwe_dimension, got {}, must be equal to the fpksk \
+        input_lwe_key_dimension, got {}",
+            fourier_bsk_output_lwe_dimension.0,
+            fpksk_input_lwe_key_dimension.0
+        );
+
+        let fpksk_output_polynomial_size = pfpksk_list.output_polynomial_size();
+        let fpksk_output_glwe_key_dimension = pfpksk_list.output_key_glwe_dimension();
+
+        debug_assert!(
+            ggsw_out.polynomial_size() == fpksk_output_polynomial_size,
+            "The output GGSW ciphertext needs to have the same polynomial size as the fpksks, \
+        got {}, expected {}",
+            ggsw_out.polynomial_size().0,
+            fpksk_output_polynomial_size.0
+        );
+
+        debug_assert!(
+            ggsw_out.glwe_size().to_glwe_dimension() == fpksk_output_glwe_key_dimension,
+            "The output GGSW ciphertext needs to have the same GLWE dimension as the fpksks, \
+        got {}, expected {}",
+            ggsw_out.glwe_size().to_glwe_dimension().0,
+            fpksk_output_glwe_key_dimension.0
+        );
+
+        debug_assert!(
+            ggsw_out.glwe_size().0 == pfpksk_list.lwe_pfpksk_count().0,
+            "The input vector of pfpksk_list needs to have {} ggsw.glwe_size elements got {}",
+            ggsw_out.glwe_size().0,
+            pfpksk_list.lwe_pfpksk_count().0,
+        );
+
+        // Output buffer shared by every bootstrapping (overwritten at each level)
+        let (mut lwe_out_bs_buffer_data, mut stack) = stack.make_aligned_with(
+            fourier_bsk_output_lwe_dimension.to_lwe_size().0,
+            CACHELINE_ALIGN,
+            |_| Scalar::ZERO,
+        );
+        let mut lwe_out_bs_buffer = LweCiphertext::from_container(
+            &mut *lwe_out_bs_buffer_data,
+            lwe_in.ciphertext_modulus(),
+        );
+
+        for (decomposition_level_minus_one, mut ggsw_level_matrix) in
+            ggsw_out.iter_mut().enumerate()
+        {
+            let decomposition_level = DecompositionLevel(decomposition_level_minus_one + 1); // 1-based
+            homomorphic_shift_boolean( // PBS of `lwe_in` for this level into `lwe_out_bs_buffer`
+                fourier_bsk,
+                lwe_out_bs_buffer.as_mut_view(),
+                lwe_in.as_view(),
+                decomposition_level,
+                base_log_cbs,
+                delta_log,
+                fft,
+                stack.rb_mut(),
+            );
+
+            for (pfpksk, mut glwe_out) in pfpksk_list // i-th key fills the i-th GLWE of the level
+                .iter()
+                .zip(ggsw_level_matrix.as_mut_glwe_list().iter_mut())
+            {
+                private_functional_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+                    &pfpksk,
+                    &mut glwe_out,
+                    &lwe_out_bs_buffer,
+                );
+            }
+        }
+    }
+
+    pub fn homomorphic_shift_boolean_scratch<Scalar>( // stack need of `homomorphic_shift_boolean`
+        lwe_in_size: LweSize,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> {
+        let align = CACHELINE_ALIGN;
+        StackReq::try_new_aligned::<Scalar>(lwe_in_size.0, align)? // left-shifted copy of the input
+            .try_and(StackReq::try_new_aligned::<Scalar>( // PBS accumulator (one GLWE ciphertext)
+                polynomial_size.0 * glwe_size.0,
+                align,
+            )?)?
+            .try_and( // scratch of the bootstrap itself, needed on top of both buffers
+                programmable_bootstrap_lwe_ciphertext_mem_optimized_requirement::<Scalar>(
+                    glwe_size,
+                    polynomial_size,
+                    fft,
+                )?,
+            )
+    }
+
+    /// Homomorphic shift for LWE without padding bit
+    ///
+    /// Starts by shifting the message bit at bit #delta_log to the padding bit and then shifts it
+    /// to the right by base_log * level (`level_count_cbs` is one level, not a level count).
+    pub fn homomorphic_shift_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
+        fourier_bsk: FourierLweBootstrapKeyView<'_>,
+        mut lwe_out: LweCiphertext<&mut [Scalar]>, // receives the bootstrapped, re-scaled bit
+        lwe_in: LweCiphertext<&[Scalar]>,
+        level_count_cbs: DecompositionLevel,
+        base_log_cbs: DecompositionBaseLog,
+        delta_log: DeltaLog,
+        fft: FftView<'_>,
+        stack: PodStack<'_>, // scratch memory, see `homomorphic_shift_boolean_scratch`
+    ) {
+        debug_assert!(lwe_out.ciphertext_modulus() == lwe_in.ciphertext_modulus());
+        debug_assert!(
+            lwe_in.ciphertext_modulus().is_native_modulus(),
+            "This operation currently only supports native moduli"
+        );
+
+        let ciphertext_n_bits = Scalar::BITS;
+        let lwe_in_size = lwe_in.lwe_size();
+        let polynomial_size = fourier_bsk.polynomial_size();
+        let ciphertext_modulus = lwe_out.ciphertext_modulus();
+
+        let (mut lwe_left_shift_buffer_data, stack) =
+            stack.make_aligned_with(lwe_in_size.0, CACHELINE_ALIGN, |_| Scalar::ZERO);
+        let mut lwe_left_shift_buffer = LweCiphertext::from_container(
+            &mut *lwe_left_shift_buffer_data,
+            lwe_in.ciphertext_modulus(),
+        );
+        // Shift message LSB on padding bit, at this point we expect to have messages with only 1
+        // bit of information
+        lwe_ciphertext_cleartext_mul(
+            &mut lwe_left_shift_buffer,
+            &lwe_in,
+            Cleartext(Scalar::ONE << (ciphertext_n_bits - delta_log.0 - 1)),
+        );
+
+        // Add q/4 to center the error while computing a negacyclic LUT
+        let shift_buffer_body = lwe_left_shift_buffer.get_mut_body();
+        *shift_buffer_body.data =
+            (*shift_buffer_body.data).wrapping_add(Scalar::ONE << (ciphertext_n_bits - 2));
+
+        let (mut pbs_accumulator_data, stack) = stack.make_aligned_with(
+            polynomial_size.0 * fourier_bsk.glwe_size().0,
+            CACHELINE_ALIGN,
+            |_| Scalar::ZERO,
+        );
+        let mut pbs_accumulator = GlweCiphertextMutView::from_container(
+            &mut *pbs_accumulator_data,
+            polynomial_size,
+            ciphertext_modulus,
+        );
+
+        // Fill lut (equivalent to trivial encryption as mask is 0s)
+        // The LUT is filled with -alpha in each coefficient where
+        // alpha = 2^{log(q) - 1 - base_log * level}
+        pbs_accumulator
+            .get_mut_body()
+            .as_mut()
+            .fill(Scalar::ZERO.wrapping_sub(
+                Scalar::ONE << (ciphertext_n_bits - 1 - base_log_cbs.0 * level_count_cbs.0),
+            ));
+
+        // Applying a negacyclic LUT on a ciphertext with one bit of message in the MSB and no bit
+        // of padding
+        fourier_bsk.bootstrap(
+            lwe_out.as_mut_view(),
+            lwe_left_shift_buffer.as_view(),
+            pbs_accumulator.as_view(),
+            fft,
+            stack,
+        );
+
+        // Add alpha where alpha = 2^{log(q) - 1 - base_log * level}
+        // To end up with an encryption of 0 if the message bit was 0 and 1 in the other case
+        let out_body = lwe_out.get_mut_body();
+        *out_body.data = (*out_body.data).wrapping_add(
+            Scalar::ONE << (ciphertext_n_bits - 1 - base_log_cbs.0 * level_count_cbs.0),
+        );
+    }
+
+    pub fn cmux_tree_memory_optimized_scratch<Scalar>( // stack need of `cmux_tree_memory_optimized`
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        nb_layer: usize,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> {
+        let t_scratch = StackReq::try_new_aligned::<Scalar>( // a list of nb_layer GLWE ciphertexts
+            polynomial_size.0 * glwe_size.0 * nb_layer,
+            CACHELINE_ALIGN,
+        )?;
+
+        StackReq::try_all_of([ // every entry below must be available at the same time
+            t_scratch,                             // t_0
+            t_scratch,                             // t_1
+            StackReq::try_new::<usize>(nb_layer)?, // t_fill
+            t_scratch,                             // diff; NOTE(review): looks oversized — confirm
+            add_external_product_assign_mem_optimized_requirement::<Scalar>(
+                glwe_size,
+                polynomial_size,
+                fft,
+            )?,
+        ])
+    }
+
+    /// Perform a tree of cmux in a way that limits the total allocated memory to avoid issues for
+    /// bigger trees. The selected LUT polynomial ends up in `output_glwe`.
+    pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
+        mut output_glwe: GlweCiphertext<&mut [Scalar]>,
+        lut_per_layer: PolynomialList<&[Scalar]>, // 2^(number of GGSWs) polynomials (debug-checked)
+        ggsw_list: FourierGgswCiphertextListView<'_>, // one selector per layer, used last-to-first
+        fft: FftView<'_>,
+        stack: PodStack<'_>, // scratch memory, see `cmux_tree_memory_optimized_scratch`
+    ) {
+        debug_assert!(lut_per_layer.polynomial_count().0 == 1 << ggsw_list.count());
+
+        if ggsw_list.count() > 0 {
+            let glwe_size = output_glwe.glwe_size();
+            let ciphertext_modulus = output_glwe.ciphertext_modulus();
+            let polynomial_size = ggsw_list.polynomial_size();
+            let nb_layer = ggsw_list.count();
+
+            debug_assert!(stack.can_hold(
+                cmux_tree_memory_optimized_scratch::<Scalar>(
+                    glwe_size,
+                    polynomial_size,
+                    nb_layer,
+                    fft
+                )
+                .unwrap()
+            ));
+
+            // These are accumulators used to propagate the result from layer to layer.
+            // At index 0 you have the LUTs that get loaded, and then the result for each layer
+            // gets computed at the next index; the last layer result gets stored in
+            // `output_glwe`. This allows using memory space in C * nb_layer instead of C' *
+            // 2 ^ nb_layer
+            let (mut t_0_data, stack) = stack.make_aligned_with(
+                polynomial_size.0 * glwe_size.0 * nb_layer,
+                CACHELINE_ALIGN,
+                |_| Scalar::ZERO,
+            );
+            let (mut t_1_data, stack) = stack.make_aligned_with(
+                polynomial_size.0 * glwe_size.0 * nb_layer,
+                CACHELINE_ALIGN,
+                |_| Scalar::ZERO,
+            );
+
+            let mut t_0 = GlweCiphertextList::from_container( // first slot of every layer
+                t_0_data.as_mut(),
+                glwe_size,
+                polynomial_size,
+                ciphertext_modulus,
+            );
+            let mut t_1 = GlweCiphertextList::from_container( // second slot of every layer
+                t_1_data.as_mut(),
+                glwe_size,
+                polynomial_size,
+                ciphertext_modulus,
+            );
+
+            let (mut t_fill, mut stack) = stack.make_with(nb_layer, |_| 0_usize); // slots filled per layer
+
+            let mut lut_polynomial_iter = lut_per_layer.iter();
+            loop { // consumes the LUT polynomials two at a time
+                let even = lut_polynomial_iter.next();
+                let odd = lut_polynomial_iter.next();
+
+                let (lut_2i, lut_2i_plus_1) = match (even, odd) {
+                    (Some(even), Some(odd)) => (even, odd),
+                    _ => break, // no complete pair left
+                };
+
+                let mut t_iter = izip!(t_0.iter_mut(), t_1.iter_mut(),).enumerate();
+
+                let (mut j_counter, (mut t0_j, mut t1_j)) = t_iter.next().unwrap();
+
+                t0_j.get_mut_body() // load the pair into the bodies of the layer-0 slots
+                    .as_mut()
+                    .copy_from_slice(lut_2i.as_ref());
+
+                t1_j.get_mut_body()
+                    .as_mut()
+                    .copy_from_slice(lut_2i_plus_1.as_ref());
+
+                t_fill[0] = 2;
+
+                for (j, ggsw) in ggsw_list.into_ggsw_iter().rev().enumerate() {
+                    if t_fill[j] == 2 { // both slots of layer j are ready: combine them
+                        let (diff_data, stack) = stack.rb_mut().collect_aligned(
+                            CACHELINE_ALIGN,
+                            izip!(t1_j.as_ref(), t0_j.as_ref()).map(|(&a, &b)| a.wrapping_sub(b)),
+                        );
+                        let diff = GlweCiphertext::from_container( // t1_j - t0_j
+                            &*diff_data,
+                            polynomial_size,
+                            ciphertext_modulus,
+                        );
+
+                        if j != nb_layer - 1 { // intermediate layer: result goes to layer j + 1
+                            let (j_counter_plus_1, (mut t_0_j_plus_1, mut t_1_j_plus_1)) =
+                                t_iter.next().unwrap();
+
+                            assert_eq!(j_counter, j);
+                            assert_eq!(j_counter_plus_1, j + 1);
+
+                            let mut output = if t_fill[j + 1] == 0 { // pick the free slot
+                                t_0_j_plus_1.as_mut_view()
+                            } else {
+                                t_1_j_plus_1.as_mut_view()
+                            };
+
+                            output.as_mut().copy_from_slice(t0_j.as_ref()); // output = t0_j, then
+                            add_external_product_assign_mem_optimized( // += ggsw (x) diff
+                                &mut output,
+                                &ggsw,
+                                &diff,
+                                fft,
+                                stack,
+                            );
+                            t_fill[j + 1] += 1;
+                            t_fill[j] = 0; // layer j is free again for the next pair
+
+                            drop(diff_data);
+
+                            (j_counter, t0_j, t1_j) =
+                                (j_counter_plus_1, t_0_j_plus_1, t_1_j_plus_1);
+                        } else { // last layer: the result is written to the caller's output
+                            let mut output = output_glwe.as_mut_view();
+                            output.as_mut().copy_from_slice(t0_j.as_ref());
+                            add_external_product_assign_mem_optimized(
+                                &mut output,
+                                &ggsw,
+                                &diff,
+                                fft,
+                                stack,
+                            );
+                        }
+                    } else {
+                        break; // layer j still waits for its second input
+                    }
+                }
+            }
+        } else { // no selector: zero mask, body copied straight from `lut_per_layer`
+            output_glwe.get_mut_mask().as_mut().fill(Scalar::ZERO);
+            output_glwe
+                .get_mut_body()
+                .as_mut()
+                .copy_from_slice(lut_per_layer.as_ref());
+        }
+    }
+
+    pub fn circuit_bootstrap_boolean_vertical_packing_scratch<Scalar>( // stack need of the CBS + VP
+        lwe_list_in_count: LweCiphertextCount,
+        lwe_list_out_count: LweCiphertextCount,
+        lwe_in_size: LweSize,
+        big_lut_polynomial_count: PolynomialCount,
+        bsk_output_lwe_size: LweSize,
+        glwe_size: GlweSize,
+        fpksk_output_polynomial_size: PolynomialSize,
+        level_cbs: DecompositionLevelCount,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> {
+        // We deduce the number of luts in the vec_lut from the number of ciphertexts in
+        // lwe_list_out
+        let number_of_luts = lwe_list_out_count.0; // used as a divisor on the next line
+        let small_lut_size = PolynomialCount(big_lut_polynomial_count.0 / number_of_luts);
+
+        StackReq::try_all_of([ // the three entries below are needed simultaneously
+            StackReq::try_new_aligned::<c64>( // Fourier GGSW list, one GGSW per input ciphertext
+                lwe_list_in_count.0 * fpksk_output_polynomial_size.0 / 2
+                    * glwe_size.0
+                    * glwe_size.0
+                    * level_cbs.0,
+                CACHELINE_ALIGN,
+            )?,
+            StackReq::try_new_aligned::<Scalar>( // one standard-domain GGSW
+                fpksk_output_polynomial_size.0 * glwe_size.0 * glwe_size.0 * level_cbs.0,
+                CACHELINE_ALIGN,
+            )?,
+            StackReq::try_any_of([ // alternatives: room for any one of these three
+                circuit_bootstrap_boolean_scratch::<Scalar>(
+                    lwe_in_size,
+                    bsk_output_lwe_size,
+                    glwe_size,
+                    fpksk_output_polynomial_size,
+                    fft,
+                )?,
+                convert_standard_ggsw_ciphertext_to_fourier_mem_optimized_requirement(fft)?,
+                vertical_packing_scratch::<Scalar>(
+                    glwe_size,
+                    fpksk_output_polynomial_size,
+                    small_lut_size,
+                    lwe_list_in_count.0,
+                    fft,
+                )?,
+            ])?,
+        ])
+    }
+
+    /// Perform a circuit bootstrap followed by a vertical packing on ciphertexts encrypting boolean
+    /// messages.
+    ///
+    /// The circuit bootstrapping uses the private functional packing key switch.
+    ///
+    /// This is supposed to be used only with boolean (1 bit of message) LWE ciphertexts.
+    pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastInto<usize>>(
+        big_lut_as_polynomial_list: PolynomialList<&[Scalar]>,
+        fourier_bsk: FourierLweBootstrapKeyView<'_>,
+        mut lwe_list_out: LweCiphertextList<&mut [Scalar]>,
+        lwe_list_in: LweCiphertextList<&[Scalar]>,
+        pfpksk_list: LwePrivateFunctionalPackingKeyswitchKeyList<&[Scalar]>,
+        level_cbs: DecompositionLevelCount,
+        base_log_cbs: DecompositionBaseLog,
+        fft: FftView<'_>,
+        stack: PodStack<'_>,
+    ) {
+        debug_assert!(stack.can_hold(
+            circuit_bootstrap_boolean_vertical_packing_scratch::<Scalar>(
+                lwe_list_in.lwe_ciphertext_count(),
+                lwe_list_out.lwe_ciphertext_count(),
+                lwe_list_in.lwe_size(),
+                big_lut_as_polynomial_list.polynomial_count(),
+                fourier_bsk.output_lwe_dimension().to_lwe_size(),
+                fourier_bsk.glwe_size(),
+                pfpksk_list.output_polynomial_size(),
+                level_cbs,
+                fft
+            )
+            .unwrap()
+        ));
+        debug_assert!(
+            lwe_list_in.lwe_ciphertext_count().0 != 0,
+            "Got empty `lwe_list_in`"
+        );
+        debug_assert!(
+            lwe_list_out.lwe_size().to_lwe_dimension() == fourier_bsk.output_lwe_dimension(),
+            "Output LWE ciphertext needs to have an LweDimension of {}, got {}",
+            lwe_list_out.lwe_size().to_lwe_dimension().0,
+            fourier_bsk.output_lwe_dimension().0
+        );
+        debug_assert!(lwe_list_out.ciphertext_modulus() == lwe_list_in.ciphertext_modulus());
+        debug_assert!(lwe_list_in.ciphertext_modulus() == pfpksk_list.ciphertext_modulus());
+        debug_assert!(
+            pfpksk_list.ciphertext_modulus().is_native_modulus(),
+            "This operation currently only supports native moduli"
+        );
+
+        let glwe_size = pfpksk_list.output_key_glwe_dimension().to_glwe_size();
+        let (mut ggsw_list_data, stack) = stack.make_aligned_with(
+            lwe_list_in.lwe_ciphertext_count().0 * pfpksk_list.output_polynomial_size().0 / 2
+                * glwe_size.0
+                * glwe_size.0
+                * level_cbs.0,
+            CACHELINE_ALIGN,
+            |_| c64::default(),
+        );
+        let (mut ggsw_res_data, mut stack) = stack.make_aligned_with(
+            pfpksk_list.output_polynomial_size().0 * glwe_size.0 * glwe_size.0 * level_cbs.0,
+            CACHELINE_ALIGN,
+            |_| Scalar::ZERO,
+        );
+
+        let mut ggsw_list = FourierGgswCiphertextListMutView::new(
+            &mut ggsw_list_data,
+            lwe_list_in.lwe_ciphertext_count().0,
+            glwe_size,
+            pfpksk_list.output_polynomial_size(),
+            base_log_cbs,
+            level_cbs,
+        );
+
+        let mut ggsw_res = GgswCiphertext::from_container(
+            &mut *ggsw_res_data,
+            glwe_size,
+            pfpksk_list.output_polynomial_size(),
+            base_log_cbs,
+            pfpksk_list.ciphertext_modulus(),
+        );
+
+        for (lwe_in, mut ggsw) in
+            izip!(lwe_list_in.iter(), ggsw_list.as_mut_view().into_ggsw_iter(),)
+        {
+            circuit_bootstrap_boolean(
+                fourier_bsk,
+                lwe_in,
+                ggsw_res.as_mut_view(),
+                DeltaLog(Scalar::BITS - 1),
+                pfpksk_list.as_view(),
+                fft,
+                stack.rb_mut(),
+            );
+
+            convert_standard_ggsw_ciphertext_to_fourier_mem_optimized(
+                &ggsw_res,
+                &mut ggsw,
+                fft,
+                stack.rb_mut(),
+            );
+        }
+
+        // We deduce the number of luts in the vec_lut from the number of ciphertexts in
+        // lwe_list_out
+        let number_of_luts = lwe_list_out.lwe_ciphertext_count().0;
+
+        let small_lut_size = big_lut_as_polynomial_list.polynomial_count().0 / number_of_luts;
+
+        for (lut, lwe_out) in izip!(
+            big_lut_as_polynomial_list.chunks_exact(small_lut_size),
+            lwe_list_out.iter_mut(),
+        ) {
+            vertical_packing(lut, lwe_out, ggsw_list.as_view(), fft, stack.rb_mut());
+        }
+    }
+
+    /// Return the stack space needed by [`vertical_packing`].
+    ///
+    /// The requirement is made of one aligned buffer of `polynomial_size * glwe_size` scalars
+    /// (the CMux tree result) together with enough room for either the blind rotation or the
+    /// CMux tree.
+    ///
+    /// `lut_polynomial_count` is the number of polynomials in the LUT and `ggsw_list_count` the
+    /// number of GGSW ciphertexts available to select among them.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the [`SizeOverflow`] reported by any of the underlying requirement
+    /// computations.
+    pub fn vertical_packing_scratch<Scalar>(
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        lut_polynomial_count: PolynomialCount,
+        ggsw_list_count: usize,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> {
+        // Get the base 2 logarithm (rounded down) of the number of polynomials in the list i.e. if
+        // there is one polynomial, the number will be 0.
+        // The count is a `usize`, so the computation has to use the bit width of `usize` and not
+        // the one of `Scalar`: with a 32-bit scalar on a 64-bit target `leading_zeros` can exceed
+        // the width of `Scalar` and the subtraction would underflow.
+        let log_lut_number = (usize::BITS - 1 - lut_polynomial_count.0.leading_zeros()) as usize;
+
+        let log_number_of_luts_for_cmux_tree = if log_lut_number > ggsw_list_count {
+            // This means that we don't have enough GGSW to perform the CMux tree, we can only do
+            // the blind rotation
+            0
+        } else {
+            log_lut_number
+        };
+
+        StackReq::try_all_of([
+            // cmux_tree_lut_res
+            StackReq::try_new_aligned::<Scalar>(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN)?,
+            StackReq::try_any_of([
+                blind_rotate_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)?,
+                cmux_tree_memory_optimized_scratch::<Scalar>(
+                    glwe_size,
+                    polynomial_size,
+                    log_number_of_luts_for_cmux_tree,
+                    fft,
+                )?,
+            ])?,
+        ])
+    }
+
+    /// Perform the vertical packing of `lut` and write the resulting sample into `lwe_out`.
+    ///
+    /// The GGSW list is split in two: the first ciphertexts drive a CMux tree over the
+    /// polynomials of `lut`, the remaining ones drive a blind rotation of the CMux tree output.
+    /// The constant coefficient of the resulting GLWE ciphertext is then sample-extracted into
+    /// `lwe_out`.
+    ///
+    /// GGSW ciphertexts are stored from the msb (`ggsw_list[0]`) to the lsb (`ggsw_list[last]`).
+    ///
+    /// `stack` provides the scratch memory used by the intermediate computations.
+    ///
+    /// # Panics
+    ///
+    /// In debug builds, panics if `lwe_out` does not use a native ciphertext modulus or if its
+    /// LWE dimension is not the one equivalent to the GLWE dimension and polynomial size of
+    /// `ggsw_list`.
+    pub fn vertical_packing<Scalar: UnsignedTorus + CastInto<usize>>(
+        lut: PolynomialList<&[Scalar]>,
+        mut lwe_out: LweCiphertext<&mut [Scalar]>,
+        ggsw_list: FourierGgswCiphertextListView<'_>,
+        fft: FftView<'_>,
+        stack: PodStack<'_>,
+    ) {
+        debug_assert!(
+            lwe_out.ciphertext_modulus().is_native_modulus(),
+            "This operation currently only supports native moduli"
+        );
+
+        let polynomial_size = ggsw_list.polynomial_size();
+        let glwe_size = ggsw_list.glwe_size();
+        let glwe_dimension = glwe_size.to_glwe_dimension();
+        let ciphertext_modulus = lwe_out.ciphertext_modulus();
+
+        debug_assert!(
+            lwe_out.lwe_size().to_lwe_dimension()
+                == glwe_dimension.to_equivalent_lwe_dimension(polynomial_size),
+            "Output LWE ciphertext needs to have an LweDimension of {:?}, got {:?}",
+            glwe_dimension.to_equivalent_lwe_dimension(polynomial_size),
+            lwe_out.lwe_size().to_lwe_dimension(),
+        );
+
+        // Get the base 2 logarithm (rounded down) of the number of polynomials in the list i.e. if
+        // there is one polynomial, the number will be 0.
+        // The count is a `usize`, so the computation has to use the bit width of `usize` and not
+        // the one of `Scalar`: with a 32-bit scalar on a 64-bit target `leading_zeros` can exceed
+        // the width of `Scalar` and the subtraction would underflow.
+        let log_lut_number = (usize::BITS - 1 - lut.polynomial_count().0.leading_zeros()) as usize;
+
+        let log_number_of_luts_for_cmux_tree = if log_lut_number > ggsw_list.count() {
+            // This means that we don't have enough GGSW to perform the CMux tree, we can only do
+            // the blind rotation
+            0
+        } else {
+            log_lut_number
+        };
+
+        // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the lsb GGSW is for
+        // the last blind rotation.
+        let (cmux_ggsw, br_ggsw) = ggsw_list.split_at(log_number_of_luts_for_cmux_tree);
+
+        // Scratch GLWE ciphertext receiving the CMux tree output, then rotated in place.
+        let (mut cmux_tree_lut_res_data, mut stack) =
+            stack.make_aligned_with(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN, |_| {
+                Scalar::ZERO
+            });
+        let mut cmux_tree_lut_res = GlweCiphertext::from_container(
+            &mut *cmux_tree_lut_res_data,
+            polynomial_size,
+            ciphertext_modulus,
+        );
+
+        cmux_tree_memory_optimized(
+            cmux_tree_lut_res.as_mut_view(),
+            lut,
+            cmux_ggsw,
+            fft,
+            stack.rb_mut(),
+        );
+        blind_rotate_assign(
+            cmux_tree_lut_res.as_mut_view(),
+            br_ggsw,
+            fft,
+            stack.rb_mut(),
+        );
+
+        // sample extract of the RLWE of the Vertical packing
+        extract_lwe_sample_from_glwe_ciphertext(&cmux_tree_lut_res, &mut lwe_out, MonomialDegree(0))
+    }
+
+    /// Return the stack space needed by [`blind_rotate_assign`]: one aligned buffer of
+    /// `polynomial_size * glwe_size` scalars plus the requirement reported by
+    /// `cmux_assign_mem_optimized_requirement`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the [`SizeOverflow`] reported by either requirement computation.
+    pub fn blind_rotate_assign_scratch<Scalar>(
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        fft: FftView<'_>,
+    ) -> Result<StackReq, SizeOverflow> {
+        // Room for the copy of the GLWE ciphertext made before each CMux.
+        let glwe_len = polynomial_size.0 * glwe_size.0;
+        let glwe_copy_req = StackReq::try_new_aligned::<Scalar>(glwe_len, CACHELINE_ALIGN)?;
+
+        // Room for the CMux itself, which runs while the copy is still alive.
+        let cmux_req =
+            cmux_assign_mem_optimized_requirement::<Scalar>(glwe_size, polynomial_size, fft)?;
+
+        StackReq::try_all_of([glwe_copy_req, cmux_req])
+    }
+
+    /// Blind rotate the GLWE ciphertext `lut` in place, using one CMux per GGSW ciphertext.
+    ///
+    /// The GGSW ciphertexts are consumed from the last to the first. For each of them, a copy of
+    /// the current `lut` is divided by the monomial of degree `d` and
+    /// `cmux_assign_mem_optimized` is called with `lut`, the divided copy and the GGSW
+    /// ciphertext. `d` starts at 1 and is doubled after every division.
+    ///
+    /// `stack` provides the scratch memory for the copy and for the CMux.
+    pub fn blind_rotate_assign<Scalar: UnsignedTorus + CastInto<usize>>(
+        mut lut: GlweCiphertext<&mut [Scalar]>,
+        ggsw_list: FourierGgswCiphertextListView<'_>,
+        fft: FftView<'_>,
+        mut stack: PodStack<'_>,
+    ) {
+        let mut rotation = MonomialDegree(1);
+
+        // Walk the GGSW ciphertexts from the last one to the first one.
+        for ggsw in ggsw_list.into_ggsw_iter().rev() {
+            let mut accumulator = lut.as_mut_view();
+
+            // Copy the accumulator into scratch memory; what is left of the stack is handed to
+            // the CMux below.
+            let (mut rotated_data, cmux_stack) = stack
+                .rb_mut()
+                .collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
+            let mut rotated = GlweCiphertext::from_container(
+                &mut *rotated_data,
+                accumulator.polynomial_size(),
+                accumulator.ciphertext_modulus(),
+            );
+
+            // Divide every polynomial of the copy by the current monomial.
+            for mut poly in rotated.as_mut_polynomial_list().iter_mut() {
+                polynomial_wrapping_monic_monomial_div_assign(&mut poly, rotation);
+            }
+
+            // The next GGSW ciphertext uses a monomial degree twice as large.
+            rotation.0 <<= 1;
+
+            cmux_assign_mem_optimized(&mut accumulator, &mut rotated, &ggsw, fft, cmux_stack);
+        }
+    }
+}
+use implementation::*;
+
 /// Allocate a new [`list of LWE private functional packing keyswitch
 /// keys`](`LwePrivateFunctionalPackingKeyswitchKeyList`) and fill it with actual keys required to
 /// perform a circuit bootstrap.
@@ -730,3 +1663,6 @@ pub fn circuit_bootstrap_boolean_vertical_packing_lwe_ciphertext_list_mem_optimi
        fft,
    )
 }
+
+#[cfg(test)]
+mod tests;
--- a/tfhe/src/core_crypto/fft_impl/fft64/crypto/wop_pbs/tests.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/crypto/wop_pbs/tests.rs
@@ -3,19 +3,17 @@ use crate::core_crypto::algorithms::slice_algorithms::*;
 use crate::core_crypto::commons::dispersion::{LogStandardDev, StandardDev};
 use crate::core_crypto::commons::generators::{EncryptionRandomGenerator, SecretRandomGenerator};
 use crate::core_crypto::commons::math::decomposition::SignedDecomposer;
+use crate::core_crypto::commons::math::fft64::Fft;
 use crate::core_crypto::commons::parameters::{
    DecompositionBaseLog, DecompositionLevelCount, DeltaLog, ExtractedBitsCount, GlweDimension,
    LweDimension, PlaintextCount, PolynomialCount, PolynomialSize,
 };
 use crate::core_crypto::commons::test_tools;
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::{
-    fill_with_forward_fourier_scratch, FourierLweBootstrapKey,
-};
-use crate::core_crypto::fft_impl::fft64::math::fft::Fft;
+use crate::core_crypto::commons::utils::izip;
 use crate::core_crypto::seeders::new_seeder;
 use concrete_csprng::generators::SoftwareRandomGenerator;
 use concrete_fft::c64;
-use dyn_stack::{GlobalPodBuffer, PodStack, ReborrowMut, StackReq};
+use dyn_stack::{GlobalPodBuffer, PodStack, ReborrowMut};

 // Extract all the bits of a LWE
 #[test]
@@ -90,19 +88,14 @@ pub fn test_extract_bits() {

    let input_lwe_dimension = lwe_big_sk.lwe_dimension();

-    let req = || {
-        StackReq::try_any_of([
-            fill_with_forward_fourier_scratch(fft)?,
-            extract_bits_scratch::<u64>(
-                input_lwe_dimension,
-                ksk_lwe_big_to_small.output_key_lwe_dimension(),
-                glwe_dimension.to_glwe_size(),
-                polynomial_size,
-                fft,
-            )?,
-        ])
-    };
-    let req = req().unwrap();
+    let req = extract_bits_scratch::<u64>(
+        input_lwe_dimension,
+        ksk_lwe_big_to_small.output_key_lwe_dimension(),
+        glwe_dimension.to_glwe_size(),
+        polynomial_size,
+        fft,
+    )
+    .unwrap();
    let mut mem = GlobalPodBuffer::new(req);
    let mut stack = PodStack::new(&mut mem);

@@ -481,11 +474,7 @@ pub fn test_cmux_tree() {
                &mut encryption_generator,
            );

-            let mut mem = GlobalPodBuffer::new(fill_with_forward_fourier_scratch(fft).unwrap());
-            let stack = PodStack::new(&mut mem);
-            fourier_ggsw
-                .as_mut_view()
-                .fill_with_forward_fourier(ggsw.as_view(), fft, stack);
+            convert_standard_ggsw_ciphertext_to_fourier(&ggsw, &mut fourier_ggsw);
        }

        let mut result_cmux_tree =
--- a/tfhe/src/core_crypto/commons/math/decomposition/iter.rs
+++ b/tfhe/src/core_crypto/commons/math/decomposition/iter.rs
@@ -4,6 +4,7 @@ use crate::core_crypto::commons::math::decomposition::{
 };
 use crate::core_crypto::commons::numeric::UnsignedInteger;
 use crate::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount};
+use crate::core_crypto::commons::utils::izip;

 /// An iterator that yields the terms of the signed decomposition of an integer.
 ///
@@ -30,6 +31,20 @@ where
    fresh: bool,
 }

+/// Decomposition state for a whole slice of scalars, producing one level of terms at a time.
+///
+/// The per-scalar states live in a buffer borrowed for `'buffer`; the terms of each level are
+/// written into a caller-provided slice by `fill_next_term`.
+pub struct TensorSignedDecompositionLendingIter<'buffer, Scalar: UnsignedInteger> {
+    // The base log of the decomposition
+    base_log: usize,
+    // The current level; it is decremented every time a term is produced and the decomposition
+    // is over once it reaches 0
+    current_level: usize,
+    // A mask which allows to compute the mod B of a value. For B=2^4, it is of the form:
+    // ...0001111
+    mod_b_mask: Scalar,
+    // The internal states of each decomposition, one entry per decomposed scalar
+    states: &'buffer mut [Scalar],
+    // A flag which stores whether the iterator is a fresh one (for the recompose method).
+    fresh: bool,
+}
+
 impl<T> SignedDecompositionIter<T>
 where
    T: UnsignedInteger,
@@ -117,6 +132,63 @@ where
    }
 }

+impl<'buffer, Scalar: UnsignedInteger> TensorSignedDecompositionLendingIter<'buffer, Scalar> {
+    /// Create a decomposition iterator over the values yielded by `input`.
+    ///
+    /// `states` is the working buffer of the iterator: each value of `input` is shifted right by
+    /// `Scalar::BITS - base_log * level` bits and stored in the matching slot of `states`.
+    #[inline]
+    pub(crate) fn new(
+        input: impl Iterator<Item = Scalar>,
+        base_log: DecompositionBaseLog,
+        level: DecompositionLevelCount,
+        states: &'buffer mut [Scalar],
+    ) -> Self {
+        // Number of low-order bits that are not covered by `level` digits of `base_log` bits.
+        let dropped_bits = Scalar::BITS - base_log.0 * level.0;
+        izip!(&mut *states, input).for_each(|(state, value)| *state = value >> dropped_bits);
+
+        let mod_b_mask = (Scalar::ONE << base_log.0) - Scalar::ONE;
+
+        Self {
+            base_log: base_log.0,
+            current_level: level.0,
+            mod_b_mask,
+            states,
+            fresh: true,
+        }
+    }
+
+    /// Write the next decomposition term of every state into `next_term`.
+    ///
+    /// Returns the level and base log of the terms that were written, or `None` once all the
+    /// levels have been produced, in which case `next_term` is left untouched. Levels are
+    /// produced in decreasing order, starting from the level count given to `new`.
+    // inlining this improves perf of external product by about 25%, even in LTO builds
+    #[inline]
+    pub fn fill_next_term(
+        &mut self,
+        next_term: &mut [Scalar],
+    ) -> Option<(DecompositionLevel, DecompositionBaseLog)> {
+        // Decompose one level of every state, writing the resulting terms into `terms`.
+        #[inline]
+        fn decompose_all<Scalar: UnsignedInteger>(
+            terms: &mut [Scalar],
+            states: &mut [Scalar],
+            base_log: usize,
+            mod_b_mask: Scalar,
+        ) {
+            izip!(terms, states)
+                .for_each(|(term, state)| *term = decompose_one_level(base_log, state, mod_b_mask));
+        }
+
+        // The iterator is not fresh anymore, even when the decomposition is already over.
+        self.fresh = false;
+
+        let level = self.current_level;
+        if level == 0 {
+            // Every level has already been produced.
+            return None;
+        }
+        self.current_level = level - 1;
+
+        decompose_all(next_term, self.states, self.base_log, self.mod_b_mask);
+
+        Some((
+            DecompositionLevel(level),
+            DecompositionBaseLog(self.base_log),
+        ))
+    }
+}
+
 fn decompose_one_level<S: UnsignedInteger>(base_log: usize, state: &mut S, mod_b_mask: S) -> S {
    let res = *state & mod_b_mask;
    *state >>= base_log;
--- a/tfhe/src/core_crypto/fft_impl/fft64/math/fft/mod.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/math/fft/mod.rs
@@ -1,9 +1,13 @@
-use super::polynomial::{FourierPolynomialMutView, FourierPolynomialView};
 use crate::core_crypto::commons::math::torus::UnsignedTorus;
 use crate::core_crypto::commons::numeric::CastInto;
-use crate::core_crypto::commons::parameters::{PolynomialCount, PolynomialSize};
-use crate::core_crypto::commons::traits::{Container, ContainerMut, IntoContainerOwned};
+use crate::core_crypto::commons::parameters::PolynomialSize;
+use crate::core_crypto::commons::traits::container::Split;
+use crate::core_crypto::commons::traits::{Container, IntoContainerOwned};
 use crate::core_crypto::commons::utils::izip;
+use crate::core_crypto::entities::fourier_polynomial::{
+    FourierPolynomialMutView, FourierPolynomialView,
+};
+use crate::core_crypto::entities::fourier_polynomial_list::FourierPolynomialList;
 use crate::core_crypto::entities::*;
 use aligned_vec::{avec, ABox};
 use concrete_fft::c64;
@@ -21,6 +25,9 @@ use std::time::Duration;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod x86;

+#[cfg(test)]
+mod tests;
+
 /// Twisting factors from the paper:
 /// [Fast and Error-Free Negacyclic Integer Convolution using Extended Fourier Transform][paper]
 ///
@@ -375,13 +382,13 @@ impl<'a> FftView<'a> {
    ///
    /// Panics if `standard` and `self` have differing polynomial sizes, or if `fourier` doesn't
    /// have size equal to that amount divided by two.
-    pub fn forward_as_torus<'out, Scalar: UnsignedTorus>(
+    pub fn forward_as_torus<Scalar: UnsignedTorus>(
        self,
-        fourier: FourierPolynomialMutView<'out>,
+        fourier: FourierPolynomialMutView<'_>,
        standard: PolynomialView<'_, Scalar>,
        stack: PodStack<'_>,
-    ) -> FourierPolynomialMutView<'out> {
-        self.forward_with_conv(fourier, standard, convert_forward_torus, stack)
+    ) {
+        self.forward_with_conv(fourier, standard, convert_forward_torus, stack);
    }

    /// Perform a negacyclic real FFT of `standard`, viewed as integers, and stores the result in
@@ -395,23 +402,23 @@ impl<'a> FftView<'a> {
    ///
    /// Panics if `standard` and `self` have differing polynomial sizes, or if `fourier` doesn't
    /// have size equal to that amount divided by two.
-    pub fn forward_as_integer<'out, Scalar: UnsignedTorus>(
+    pub fn forward_as_integer<Scalar: UnsignedTorus>(
        self,
-        fourier: FourierPolynomialMutView<'out>,
+        fourier: FourierPolynomialMutView<'_>,
        standard: PolynomialView<'_, Scalar>,
        stack: PodStack<'_>,
-    ) -> FourierPolynomialMutView<'out> {
-        self.forward_with_conv(fourier, standard, convert_forward_integer, stack)
+    ) {
+        self.forward_with_conv(fourier, standard, convert_forward_integer, stack);
    }

    #[must_use]
    pub fn incomplete_monomial_forward_as_integer(
        self,
-        fourier: FourierPolynomialMutView<'_>,
+        mut fourier: FourierPolynomialMutView<'_>,
        degree: usize,
    ) -> c64 {
        let n = self.polynomial_size().0;
-        let fourier = fourier.data;
+        let fourier = fourier.as_mut();

        let negate = (degree / n) % 2 == 1;
        let degree = degree % n;
@@ -494,24 +501,22 @@ impl<'a> FftView<'a> {
    }

    fn forward_with_conv<
-        'out,
        Scalar: UnsignedTorus,
        F: Fn(&mut [c64], &[Scalar], &[Scalar], TwistiesView<'_>),
    >(
        self,
-        fourier: FourierPolynomialMutView<'out>,
+        mut fourier: FourierPolynomialMutView<'_>,
        standard: PolynomialView<'_, Scalar>,
        conv_fn: F,
        stack: PodStack<'_>,
-    ) -> FourierPolynomialMutView<'out> {
-        let fourier = fourier.data;
+    ) {
+        let fourier = fourier.as_mut();
        let standard = standard.as_ref();
        let n = standard.len();
        debug_assert_eq!(n, 2 * fourier.len());
        let (standard_re, standard_im) = standard.split_at(n / 2);
        conv_fn(fourier, standard_re, standard_im, self.twisties);
        self.plan.fwd(fourier, stack);
-        FourierPolynomialMutView { data: fourier }
    }

    fn backward_with_conv<
@@ -524,7 +529,7 @@ impl<'a> FftView<'a> {
        conv_fn: F,
        stack: PodStack<'_>,
    ) {
-        let fourier = fourier.data;
+        let fourier = fourier.as_ref();
        let standard = standard.as_mut();
        let n = standard.len();
        debug_assert_eq!(n, 2 * fourier.len());
@@ -542,11 +547,11 @@ impl<'a> FftView<'a> {
    >(
        self,
        mut standard: PolynomialMutView<'_, Scalar>,
-        fourier: FourierPolynomialMutView<'_>,
+        mut fourier: FourierPolynomialMutView<'_>,
        conv_fn: F,
        stack: PodStack<'_>,
    ) {
-        let fourier = fourier.data;
+        let fourier = fourier.as_mut();
        let standard = standard.as_mut();
        let n = standard.len();
        debug_assert_eq!(n, 2 * fourier.len());
@@ -557,34 +562,6 @@ impl<'a> FftView<'a> {
    }
 }

-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub struct FourierPolynomialList<C: Container<Element = c64>> {
-    pub data: C,
-    pub polynomial_size: PolynomialSize,
-}
-
-impl<C: Container<Element = c64>> FourierPolynomialList<C> {
-    pub fn polynomial_count(&self) -> PolynomialCount {
-        PolynomialCount(
-            self.data.container_len() / self.polynomial_size.to_fourier_polynomial_size().0,
-        )
-    }
-}
-
-impl<C: ContainerMut<Element = c64>> FourierPolynomialList<C> {
-    pub fn iter_mut(
-        &mut self,
-    ) -> impl DoubleEndedIterator<Item = FourierPolynomial<&'_ mut [c64]>> {
-        assert!(
-            self.data.container_len() % self.polynomial_size.to_fourier_polynomial_size().0 == 0
-        );
-        self.data
-            .as_mut()
-            .chunks_exact_mut(self.polynomial_size.to_fourier_polynomial_size().0)
-            .map(move |slice| FourierPolynomial { data: slice })
-    }
-}
-
 impl<C: Container<Element = c64>> serde::Serialize for FourierPolynomialList<C> {
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        fn serialize_impl<S: serde::Serializer>(
@@ -592,8 +569,6 @@ impl<C: Container<Element = c64>> serde::Serialize for FourierPolynomialList<C>
            polynomial_size: PolynomialSize,
            serializer: S,
        ) -> Result<S::Ok, S::Error> {
-            use crate::core_crypto::commons::traits::Split;
-
            pub struct SingleFourierPolynomial<'a> {
                fft: FftView<'a>,
                buf: &'a [c64],
@@ -654,8 +629,6 @@ impl<'de, C: IntoContainerOwned<Element = c64>> serde::Deserialize<'de>
                self,
                mut seq: A,
            ) -> Result<Self::Value, A::Error> {
-                use crate::core_crypto::commons::traits::Split;
-
                let str = "sequence of two fields and Fourier polynomials";
                let polynomial_size = match seq.next_element::<PolynomialSize>()? {
                    Some(polynomial_size) => polynomial_size,
@@ -738,16 +711,15 @@ pub fn par_convert_polynomials_list_to_fourier<Scalar: UnsignedTorus>(
            1
        };

+    let stack_len = fft
+        .forward_scratch()
+        .and_then(|req| req.try_unaligned_bytes_required())
+        .unwrap();
+
    dest.par_chunks_mut(chunk_size * f_polynomial_size)
        .zip_eq(origin.par_chunks(chunk_size * polynomial_size.0))
        .for_each(|(fourier_poly_chunk, standard_poly_chunk)| {
-            let stack_len = fft
-                .forward_scratch()
-                .unwrap()
-                .try_unaligned_bytes_required()
-                .unwrap();
            let mut stack = vec![0; stack_len];
-
            let mut stack = PodStack::new(&mut stack);

            for (fourier_poly, standard_poly) in izip!(
@@ -755,7 +727,7 @@ pub fn par_convert_polynomials_list_to_fourier<Scalar: UnsignedTorus>(
                standard_poly_chunk.chunks_exact(polynomial_size.0)
            ) {
                fft.forward_as_torus(
-                    FourierPolynomialMutView { data: fourier_poly },
+                    FourierPolynomialMutView::from_container(fourier_poly),
                    PolynomialView::from_container(standard_poly),
                    stack.rb_mut(),
                );
@@ -763,5 +735,143 @@ pub fn par_convert_polynomials_list_to_fourier<Scalar: UnsignedTorus>(
        });
 }

-#[cfg(test)]
-mod tests;
+/// Multiply every polynomial of `lhs_polynomial_list` by `fourier` and store or accumulate the
+/// products into `output_fft_buffer`.
+///
+/// `output_fft_buffer` and `lhs_polynomial_list` are walked in chunks of `fourier_poly_size`
+/// elements and each chunk is multiplied element-wise by `fourier`. When `is_output_uninit` is
+/// `true` the product overwrites the output chunk, otherwise it is combined with the existing
+/// content of the chunk through `c64s_mul_adde`.
+///
+/// NOTE(review): only the first slice returned by `c64s_as_simd` / `c64s_as_mut_simd` is
+/// processed; this presumably relies on `fourier_poly_size` being a multiple of the SIMD lane
+/// count - confirm against callers.
+#[cfg_attr(__profiling, inline(never))]
+pub(crate) fn update_with_fmadd(
+    output_fft_buffer: &mut [c64],
+    lhs_polynomial_list: &[c64],
+    fourier: &[c64],
+    is_output_uninit: bool,
+    fourier_poly_size: usize,
+) {
+    // Carrier for the arguments, so that the kernel can be dispatched on the SIMD level
+    // selected by `pulp::Arch`.
+    struct FmaddKernel<'a> {
+        output_fft_buffer: &'a mut [c64],
+        lhs_polynomial_list: &'a [c64],
+        fourier: &'a [c64],
+        is_output_uninit: bool,
+        fourier_poly_size: usize,
+    }
+
+    impl pulp::WithSimd for FmaddKernel<'_> {
+        type Output = ();
+
+        #[inline(always)]
+        fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
+            // Introducing a function boundary here means that the slices
+            // get `noalias` markers, possibly allowing better optimizations from LLVM.
+            //
+            // see:
+            // https://github.com/rust-lang/rust/blob/56e1aaadb31542b32953292001be2312810e88fd/library/core/src/slice/mod.rs#L960-L966
+            #[inline(always)]
+            fn kernel<S: pulp::Simd>(
+                simd: S,
+                output_fft_buffer: &mut [c64],
+                lhs_polynomial_list: &[c64],
+                fourier: &[c64],
+                is_output_uninit: bool,
+                fourier_poly_size: usize,
+            ) {
+                let rhs = S::c64s_as_simd(fourier).0;
+
+                // Pair every output polynomial with the matching left-hand side polynomial.
+                let poly_pairs = izip!(
+                    output_fft_buffer.into_chunks(fourier_poly_size),
+                    lhs_polynomial_list.into_chunks(fourier_poly_size)
+                );
+
+                // The branch is taken once, outside of the loops.
+                if is_output_uninit {
+                    for (out_poly, lhs_poly) in poly_pairs {
+                        let out = S::c64s_as_mut_simd(out_poly).0;
+                        let lhs = S::c64s_as_simd(lhs_poly).0;
+
+                        for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+                            *out = simd.c64s_mul(lhs, rhs);
+                        }
+                    }
+                } else {
+                    for (out_poly, lhs_poly) in poly_pairs {
+                        let out = S::c64s_as_mut_simd(out_poly).0;
+                        let lhs = S::c64s_as_simd(lhs_poly).0;
+
+                        for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+                            *out = simd.c64s_mul_adde(lhs, rhs, *out);
+                        }
+                    }
+                }
+            }
+
+            let Self {
+                output_fft_buffer,
+                lhs_polynomial_list,
+                fourier,
+                is_output_uninit,
+                fourier_poly_size,
+            } = self;
+
+            kernel(
+                simd,
+                output_fft_buffer,
+                lhs_polynomial_list,
+                fourier,
+                is_output_uninit,
+                fourier_poly_size,
+            )
+        }
+    }
+
+    pulp::Arch::new().dispatch(FmaddKernel {
+        output_fft_buffer,
+        lhs_polynomial_list,
+        fourier,
+        is_output_uninit,
+        fourier_poly_size,
+    })
+}
+
+/// Multiply every polynomial of `lhs_polynomial_list` by `fourier` and by the constant
+/// `factor`, and store or accumulate the products into `output_fft_buffer`.
+///
+/// `output_fft_buffer` and `lhs_polynomial_list` are walked in chunks of `fourier_poly_size`
+/// elements. When `is_output_uninit` is `true` the value `factor * (lhs * fourier)` overwrites
+/// the output chunk, otherwise it is combined with the existing content of the chunk through
+/// `c64s_mul_adde`.
+///
+/// NOTE(review): only the first slice returned by `c64s_as_simd` / `c64s_as_mut_simd` is
+/// processed; this presumably relies on `fourier_poly_size` being a multiple of the SIMD lane
+/// count - confirm against callers.
+pub(crate) fn update_with_fmadd_factor(
+    output_fft_buffer: &mut [c64],
+    lhs_polynomial_list: &[c64],
+    fourier: &[c64],
+    factor: c64,
+    is_output_uninit: bool,
+    fourier_poly_size: usize,
+) {
+    // Carrier for the arguments, so that the kernel can be dispatched on the SIMD level
+    // selected by `pulp::Arch`.
+    struct FmaddFactorKernel<'a> {
+        output_fft_buffer: &'a mut [c64],
+        lhs_polynomial_list: &'a [c64],
+        fourier: &'a [c64],
+        factor: c64,
+        is_output_uninit: bool,
+        fourier_poly_size: usize,
+    }
+
+    impl pulp::WithSimd for FmaddFactorKernel<'_> {
+        type Output = ();
+
+        #[inline(always)]
+        fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
+            let Self {
+                output_fft_buffer,
+                lhs_polynomial_list,
+                fourier,
+                factor,
+                is_output_uninit,
+                fourier_poly_size,
+            } = self;
+
+            // Broadcast the scalar factor to a full SIMD register.
+            let factor = simd.c64s_splat(factor);
+            let rhs = S::c64s_as_simd(fourier).0;
+
+            for (out_poly, lhs_poly) in izip!(
+                output_fft_buffer.into_chunks(fourier_poly_size),
+                lhs_polynomial_list.into_chunks(fourier_poly_size)
+            ) {
+                let out = S::c64s_as_mut_simd(out_poly).0;
+                let lhs = S::c64s_as_simd(lhs_poly).0;
+
+                if is_output_uninit {
+                    for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+                        // NOTE: factor * (lhs * rhs) is more efficient than (lhs * rhs) * factor
+                        *out = simd.c64s_mul(factor, simd.c64s_mul(lhs, rhs));
+                    }
+                } else {
+                    for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+                        // NOTE: same operand order as in the branch above
+                        *out = simd.c64s_mul_adde(factor, simd.c64s_mul(lhs, rhs), *out);
+                    }
+                }
+            }
+        }
+    }
+
+    pulp::Arch::new().dispatch(FmaddFactorKernel {
+        output_fft_buffer,
+        lhs_polynomial_list,
+        fourier,
+        factor,
+        is_output_uninit,
+        fourier_poly_size,
+    })
+}
--- a/tfhe/src/core_crypto/fft_impl/fft64/math/fft/tests.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/math/fft/tests.rs
@@ -1,8 +1,8 @@
 use dyn_stack::{GlobalPodBuffer, ReborrowMut};

-use super::super::polynomial::FourierPolynomial;
 use super::*;
 use crate::core_crypto::commons::test_tools::new_random_generator;
+use crate::core_crypto::entities::fourier_polynomial::FourierPolynomial;
 use crate::core_crypto::entities::Polynomial;
 use aligned_vec::avec;

@@ -26,9 +26,8 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
        let mut poly = Polynomial::from_container(avec![Scalar::ZERO; size].into_boxed_slice());
        let mut roundtrip =
            Polynomial::from_container(avec![Scalar::ZERO; size].into_boxed_slice());
-        let mut fourier = FourierPolynomial {
-            data: avec![c64::default(); size / 2].into_boxed_slice(),
-        };
+        let mut fourier =
+            FourierPolynomial::from_container(avec![c64::default(); size / 2].into_boxed_slice());

        for x in poly.as_mut().iter_mut() {
            *x = generator.random_uniform();
@@ -126,12 +125,12 @@ fn test_product<Scalar: UnsignedTorus>() {
            let mut convolution_from_naive =
                Polynomial::from_container(avec![Scalar::ZERO; size].into_boxed_slice());

-            let mut fourier0 = FourierPolynomial {
-                data: avec![c64::default(); size / 2].into_boxed_slice(),
-            };
-            let mut fourier1 = FourierPolynomial {
-                data: avec![c64::default(); size / 2 ].into_boxed_slice(),
-            };
+            let mut fourier0 = FourierPolynomial::from_container(
+                avec![c64::default(); size / 2].into_boxed_slice(),
+            );
+            let mut fourier1 = FourierPolynomial::from_container(
+                avec![c64::default(); size / 2 ].into_boxed_slice(),
+            );

            let integer_magnitude = 16;
            for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
@@ -150,7 +149,7 @@ fn test_product<Scalar: UnsignedTorus>() {
            fft.forward_as_torus(fourier0.as_mut_view(), poly0.as_view(), stack.rb_mut());
            fft.forward_as_integer(fourier1.as_mut_view(), poly1.as_view(), stack.rb_mut());

-            for (f0, f1) in izip!(&mut *fourier0.data, &*fourier1.data) {
+            for (f0, f1) in izip!(fourier0.as_mut(), fourier1.as_ref()) {
                *f0 *= *f1;
            }

--- a/tfhe/src/core_crypto/fft_impl/fft64/math/fft/x86.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/math/fft/x86.rs
@@ -14,9 +14,9 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;

-use super::super::super::c64;
 use super::TwistiesView;
 use crate::core_crypto::commons::utils::izip;
+use concrete_fft::c64;

 use pulp::x86::V3;
 #[cfg(feature = "nightly-avx512")]
@@ -1089,10 +1089,7 @@ pub fn convert_add_backward_torus_u64(

 #[cfg(test)]
 mod tests {
-    use crate::core_crypto::fft_impl::fft64::math::fft::{
-        convert_add_backward_torus_scalar, Twisties,
-    };
-
+    use super::super::{convert_add_backward_torus_scalar, Twisties};
    use super::*;

    #[test]
--- a/tfhe/src/core_crypto/commons/math/mod.rs
+++ b/tfhe/src/core_crypto/commons/math/mod.rs
@@ -1,5 +1,6 @@
 //! A module containing general mathematical tools.

 pub mod decomposition;
+pub mod fft64;
 pub mod random;
 pub mod torus;
--- a/tfhe/src/core_crypto/entities/fourier_ggsw_ciphertext.rs
+++ b/tfhe/src/core_crypto/entities/fourier_ggsw_ciphertext.rs
@@ -0,0 +1,403 @@
+use super::fourier_polynomial_list::FourierPolynomialList;
+use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
+use crate::core_crypto::commons::parameters::{
+    DecompositionBaseLog, DecompositionLevelCount, GlweSize, PolynomialSize,
+};
+use crate::core_crypto::commons::traits::{Container, Split};
+use crate::core_crypto::prelude::IntoContainerOwned;
+use aligned_vec::{avec, ABox};
+use concrete_fft::c64;
+
+/// A GGSW ciphertext in the Fourier domain.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(bound(deserialize = "C: IntoContainerOwned"))]
+pub struct FourierGgswCiphertext<C: Container<Element = c64>> {
+    fourier: FourierPolynomialList<C>,
+    glwe_size: GlweSize,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+}
+
+/// A matrix containing a single level of gadget decomposition, in the Fourier domain.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct FourierGgswLevelMatrix<C: Container<Element = c64>> {
+    data: C,
+    glwe_size: GlweSize,
+    polynomial_size: PolynomialSize,
+    decomposition_level: DecompositionLevel,
+}
+
+/// A row of a GGSW level matrix, in the Fourier domain.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct FourierGgswLevelRow<C: Container<Element = c64>> {
+    data: C,
+    glwe_size: GlweSize,
+    polynomial_size: PolynomialSize,
+    decomposition_level: DecompositionLevel,
+}
+
+pub type FourierGgswCiphertextView<'a> = FourierGgswCiphertext<&'a [c64]>;
+pub type FourierGgswCiphertextMutView<'a> = FourierGgswCiphertext<&'a mut [c64]>;
+pub type FourierGgswCiphertextOwned = FourierGgswCiphertext<ABox<[c64]>>;
+
+pub type FourierGgswLevelMatrixView<'a> = FourierGgswLevelMatrix<&'a [c64]>;
+pub type FourierGgswLevelMatrixMutView<'a> = FourierGgswLevelMatrix<&'a mut [c64]>;
+pub type FourierGgswLevelRowView<'a> = FourierGgswLevelRow<&'a [c64]>;
+pub type FourierGgswLevelRowMutView<'a> = FourierGgswLevelRow<&'a mut [c64]>;
+
+impl<C: Container<Element = c64>> FourierGgswCiphertext<C> {
+    pub fn from_container(
+        data: C,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        decomposition_base_log: DecompositionBaseLog,
+        decomposition_level_count: DecompositionLevelCount,
+    ) -> Self {
+        assert_eq!(
+            data.container_len(),
+            polynomial_size.to_fourier_polynomial_size().0
+                * glwe_size.0
+                * glwe_size.0
+                * decomposition_level_count.0
+        );
+
+        Self {
+            fourier: FourierPolynomialList {
+                data,
+                polynomial_size,
+            },
+            glwe_size,
+            decomposition_base_log,
+            decomposition_level_count,
+        }
+    }
+
+    pub fn polynomial_size(&self) -> PolynomialSize {
+        self.fourier.polynomial_size
+    }
+
+    pub fn glwe_size(&self) -> GlweSize {
+        self.glwe_size
+    }
+
+    pub fn decomposition_base_log(&self) -> DecompositionBaseLog {
+        self.decomposition_base_log
+    }
+
+    pub fn decomposition_level_count(&self) -> DecompositionLevelCount {
+        self.decomposition_level_count
+    }
+
+    pub fn data(self) -> C {
+        self.fourier.data
+    }
+
+    pub fn as_view(&self) -> FourierGgswCiphertextView<'_>
+    where
+        C: AsRef<[c64]>,
+    {
+        FourierGgswCiphertextView {
+            fourier: FourierPolynomialList {
+                data: self.fourier.data.as_ref(),
+                polynomial_size: self.fourier.polynomial_size,
+            },
+            glwe_size: self.glwe_size,
+            decomposition_base_log: self.decomposition_base_log,
+            decomposition_level_count: self.decomposition_level_count,
+        }
+    }
+
+    pub fn as_mut_view(&mut self) -> FourierGgswCiphertextMutView<'_>
+    where
+        C: AsMut<[c64]>,
+    {
+        FourierGgswCiphertextMutView {
+            fourier: FourierPolynomialList {
+                data: self.fourier.data.as_mut(),
+                polynomial_size: self.fourier.polynomial_size,
+            },
+            glwe_size: self.glwe_size,
+            decomposition_base_log: self.decomposition_base_log,
+            decomposition_level_count: self.decomposition_level_count,
+        }
+    }
+}
+
+impl<C: Container<Element = c64>> FourierGgswLevelMatrix<C> {
+    pub fn new(
+        data: C,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        decomposition_level: DecompositionLevel,
+    ) -> Self {
+        assert_eq!(
+            data.container_len(),
+            polynomial_size.to_fourier_polynomial_size().0 * glwe_size.0 * glwe_size.0
+        );
+        Self {
+            data,
+            polynomial_size,
+            glwe_size,
+            decomposition_level,
+        }
+    }
+
+    /// Return an iterator over the rows of the level matrix.
+    pub fn into_rows(self) -> impl DoubleEndedIterator<Item = FourierGgswLevelRow<C>>
+    where
+        C: Split,
+    {
+        self.data
+            .split_into(self.glwe_size.0)
+            .map(move |slice| FourierGgswLevelRow {
+                data: slice,
+                polynomial_size: self.polynomial_size,
+                glwe_size: self.glwe_size,
+                decomposition_level: self.decomposition_level,
+            })
+    }
+
+    pub fn polynomial_size(&self) -> PolynomialSize {
+        self.polynomial_size
+    }
+
+    pub fn glwe_size(&self) -> GlweSize {
+        self.glwe_size
+    }
+
+    pub fn decomposition_level(&self) -> DecompositionLevel {
+        self.decomposition_level
+    }
+
+    pub fn data(self) -> C {
+        self.data
+    }
+}
+
+impl<C: Container<Element = c64>> FourierGgswLevelRow<C> {
+    pub fn new(
+        data: C,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        decomposition_level: DecompositionLevel,
+    ) -> Self {
+        assert_eq!(
+            data.container_len(),
+            polynomial_size.to_fourier_polynomial_size().0 * glwe_size.0
+        );
+        Self {
+            data,
+            polynomial_size,
+            glwe_size,
+            decomposition_level,
+        }
+    }
+
+    pub fn polynomial_size(&self) -> PolynomialSize {
+        self.polynomial_size
+    }
+
+    pub fn glwe_size(&self) -> GlweSize {
+        self.glwe_size
+    }
+
+    pub fn decomposition_level(&self) -> DecompositionLevel {
+        self.decomposition_level
+    }
+
+    pub fn data(self) -> C {
+        self.data
+    }
+}
+
+impl<'a> FourierGgswCiphertextView<'a> {
+    /// Return an iterator over the level matrices.
+    pub fn into_levels(self) -> impl DoubleEndedIterator<Item = FourierGgswLevelMatrixView<'a>> {
+        self.fourier
+            .data
+            .split_into(self.decomposition_level_count.0)
+            .enumerate()
+            .map(move |(i, slice)| {
+                FourierGgswLevelMatrixView::new(
+                    slice,
+                    self.glwe_size,
+                    self.fourier.polynomial_size,
+                    DecompositionLevel(i + 1),
+                )
+            })
+    }
+}
+
+impl FourierGgswCiphertext<ABox<[c64]>> {
+    pub fn new(
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        decomposition_base_log: DecompositionBaseLog,
+        decomposition_level_count: DecompositionLevelCount,
+    ) -> FourierGgswCiphertext<ABox<[c64]>> {
+        let boxed = avec![
+            c64::default();
+            polynomial_size.to_fourier_polynomial_size().0
+                * glwe_size.0
+                * glwe_size.0
+                * decomposition_level_count.0
+        ]
+        .into_boxed_slice();
+
+        FourierGgswCiphertext::from_container(
+            boxed,
+            glwe_size,
+            polynomial_size,
+            decomposition_base_log,
+            decomposition_level_count,
+        )
+    }
+}
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+pub struct FourierGgswCiphertextList<C: Container<Element = c64>> {
+    fourier: FourierPolynomialList<C>,
+    glwe_size: GlweSize,
+    decomposition_level_count: DecompositionLevelCount,
+    decomposition_base_log: DecompositionBaseLog,
+    count: usize,
+}
+
+pub type FourierGgswCiphertextListView<'a> = FourierGgswCiphertextList<&'a [c64]>;
+pub type FourierGgswCiphertextListMutView<'a> = FourierGgswCiphertextList<&'a mut [c64]>;
+
+impl<C: Container<Element = c64>> FourierGgswCiphertextList<C> {
+    pub fn new(
+        data: C,
+        count: usize,
+        glwe_size: GlweSize,
+        polynomial_size: PolynomialSize,
+        decomposition_base_log: DecompositionBaseLog,
+        decomposition_level_count: DecompositionLevelCount,
+    ) -> Self {
+        assert_eq!(
+            data.container_len(),
+            count
+                * polynomial_size.to_fourier_polynomial_size().0
+                * glwe_size.0
+                * glwe_size.0
+                * decomposition_level_count.0
+        );
+
+        Self {
+            fourier: FourierPolynomialList {
+                data,
+                polynomial_size,
+            },
+            count,
+            glwe_size,
+            decomposition_level_count,
+            decomposition_base_log,
+        }
+    }
+
+    pub fn data(self) -> C {
+        self.fourier.data
+    }
+
+    pub fn polynomial_size(&self) -> PolynomialSize {
+        self.fourier.polynomial_size
+    }
+
+    pub fn count(&self) -> usize {
+        self.count
+    }
+
+    pub fn glwe_size(&self) -> GlweSize {
+        self.glwe_size
+    }
+
+    pub fn decomposition_level_count(&self) -> DecompositionLevelCount {
+        self.decomposition_level_count
+    }
+
+    pub fn decomposition_base_log(&self) -> DecompositionBaseLog {
+        self.decomposition_base_log
+    }
+
+    pub fn as_view(&self) -> FourierGgswCiphertextListView<'_> {
+        let fourier = FourierPolynomialList {
+            data: self.fourier.data.as_ref(),
+            polynomial_size: self.fourier.polynomial_size,
+        };
+        FourierGgswCiphertextListView {
+            fourier,
+            count: self.count,
+            glwe_size: self.glwe_size,
+            decomposition_level_count: self.decomposition_level_count,
+            decomposition_base_log: self.decomposition_base_log,
+        }
+    }
+
+    pub fn as_mut_view(&mut self) -> FourierGgswCiphertextListMutView<'_>
+    where
+        C: AsMut<[c64]>,
+    {
+        let fourier = FourierPolynomialList {
+            data: self.fourier.data.as_mut(),
+            polynomial_size: self.fourier.polynomial_size,
+        };
+        FourierGgswCiphertextListMutView {
+            fourier,
+            count: self.count,
+            glwe_size: self.glwe_size,
+            decomposition_level_count: self.decomposition_level_count,
+            decomposition_base_log: self.decomposition_base_log,
+        }
+    }
+
+    pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
+    where
+        C: Split,
+    {
+        self.fourier.data.split_into(self.count).map(move |slice| {
+            FourierGgswCiphertext::from_container(
+                slice,
+                self.glwe_size,
+                self.fourier.polynomial_size,
+                self.decomposition_base_log,
+                self.decomposition_level_count,
+            )
+        })
+    }
+
+    pub fn split_at(self, mid: usize) -> (Self, Self)
+    where
+        C: Split,
+    {
+        let polynomial_size = self.fourier.polynomial_size;
+        let glwe_size = self.glwe_size;
+        let decomposition_level_count = self.decomposition_level_count;
+        let decomposition_base_log = self.decomposition_base_log;
+
+        let (left, right) = self.fourier.data.split_at(
+            mid * polynomial_size.to_fourier_polynomial_size().0
+                * glwe_size.0
+                * glwe_size.0
+                * decomposition_level_count.0,
+        );
+        (
+            Self::new(
+                left,
+                mid,
+                glwe_size,
+                polynomial_size,
+                decomposition_base_log,
+                decomposition_level_count,
+            ),
+            Self::new(
+                right,
+                self.count - mid,
+                glwe_size,
+                polynomial_size,
+                decomposition_base_log,
+                decomposition_level_count,
+            ),
+        )
+    }
+}
--- a/tfhe/src/core_crypto/entities/fourier_lwe_bootstrap_key.rs
+++ b/tfhe/src/core_crypto/entities/fourier_lwe_bootstrap_key.rs
@@ -1,25 +1,29 @@
-use super::super::math::fft::{Fft, FftView, FourierPolynomialList};
-use super::ggsw::*;
-use crate::core_crypto::algorithms::extract_lwe_sample_from_glwe_ciphertext;
-use crate::core_crypto::algorithms::polynomial_algorithms::*;
-use crate::core_crypto::commons::math::decomposition::SignedDecomposer;
-use crate::core_crypto::commons::math::torus::UnsignedTorus;
-use crate::core_crypto::commons::numeric::CastInto;
+use super::fourier_ggsw_ciphertext::FourierGgswCiphertext;
+use super::fourier_polynomial_list::FourierPolynomialList;
+use super::lwe_bootstrap_key::LweBootstrapKey;
+use crate::core_crypto::algorithms::ggsw_conversion::convert_standard_ggsw_ciphertext_to_fourier_mem_optimized;
+use crate::core_crypto::commons::math::fft64::{par_convert_polynomials_list_to_fourier, FftView};
 use crate::core_crypto::commons::parameters::{
-    DecompositionBaseLog, DecompositionLevelCount, GlweSize, LutCountLog, LweDimension,
-    ModulusSwitchOffset, MonomialDegree, PolynomialSize,
+    DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, PolynomialSize,
 };
-use crate::core_crypto::commons::traits::{
-    Container, ContiguousEntityContainer, ContiguousEntityContainerMut, IntoContainerOwned, Split,
+use crate::core_crypto::commons::traits::contiguous_entity_container::{
+    ContiguousEntityContainer, ContiguousEntityContainerMut,
 };
+use crate::core_crypto::commons::traits::{Container, ContainerMut, Split};
 use crate::core_crypto::commons::utils::izip;
-use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::common::{pbs_modulus_switch, FourierBootstrapKey};
-use crate::core_crypto::fft_impl::fft64::math::fft::par_convert_polynomials_list_to_fourier;
-use crate::core_crypto::prelude::ContainerMut;
+use crate::core_crypto::fft_impl::common::pbs_modulus_switch;
+use crate::core_crypto::prelude::polynomial_algorithms::{
+    polynomial_wrapping_monic_monomial_div, polynomial_wrapping_monic_monomial_mul_and_subtract,
+};
+use crate::core_crypto::prelude::{
+    add_external_product_assign_mem_optimized, extract_lwe_sample_from_glwe_ciphertext, CastInto,
+    GlweCiphertextMutView, GlweCiphertextView, IntoContainerOwned, LutCountLog,
+    LweCiphertextMutView, LweCiphertextView, ModulusSwitchOffset, MonomialDegree, Polynomial,
+    SignedDecomposer, UnsignedTorus,
+};
 use aligned_vec::{avec, ABox, CACHELINE_ALIGN};
 use concrete_fft::c64;
-use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};
+use dyn_stack::{PodStack, ReborrowMut};

 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(bound(deserialize = "C: IntoContainerOwned"))]
@@ -33,6 +37,7 @@ pub struct FourierLweBootstrapKey<C: Container<Element = c64>> {

 pub type FourierLweBootstrapKeyView<'a> = FourierLweBootstrapKey<&'a [c64]>;
 pub type FourierLweBootstrapKeyMutView<'a> = FourierLweBootstrapKey<&'a mut [c64]>;
+pub type FourierLweBootstrapKeyOwned = FourierLweBootstrapKey<ABox<[c64]>>;

 impl<C: Container<Element = c64>> FourierLweBootstrapKey<C> {
    pub fn from_container(
@@ -103,7 +108,9 @@ impl<C: Container<Element = c64>> FourierLweBootstrapKey<C> {
    }

    pub fn output_lwe_dimension(&self) -> LweDimension {
-        LweDimension((self.glwe_size.0 - 1) * self.polynomial_size().0)
+        self.glwe_size
+            .to_glwe_dimension()
+            .to_equivalent_lwe_dimension(self.polynomial_size())
    }

    pub fn data(self) -> C {
@@ -125,7 +132,7 @@ impl<C: Container<Element = c64>> FourierLweBootstrapKey<C> {

    pub fn as_mut_view(&mut self) -> FourierLweBootstrapKeyMutView<'_>
    where
-        C: AsMut<[c64]>,
+        C: ContainerMut<Element = c64>,
    {
        FourierLweBootstrapKeyMutView {
            fourier: FourierPolynomialList {
@@ -140,9 +147,7 @@ impl<C: Container<Element = c64>> FourierLweBootstrapKey<C> {
    }
 }

-pub type FourierLweBootstrapKeyOwned = FourierLweBootstrapKey<ABox<[c64]>>;
-
-impl FourierLweBootstrapKey<ABox<[c64]>> {
+impl FourierLweBootstrapKeyOwned {
    pub fn new(
        input_lwe_dimension: LweDimension,
        glwe_size: GlweSize,
@@ -171,11 +176,6 @@ impl FourierLweBootstrapKey<ABox<[c64]>> {
    }
 }

-/// Return the required memory for [`FourierLweBootstrapKeyMutView::fill_with_forward_fourier`].
-pub fn fill_with_forward_fourier_scratch(fft: FftView<'_>) -> Result<StackReq, SizeOverflow> {
-    fft.forward_scratch()
-}
-
 impl<'a> FourierLweBootstrapKeyMutView<'a> {
    /// Fill a bootstrapping key with the Fourier transform of a bootstrapping key in the standard
    /// domain.
@@ -185,10 +185,15 @@ impl<'a> FourierLweBootstrapKeyMutView<'a> {
        fft: FftView<'_>,
        mut stack: PodStack<'_>,
    ) {
-        for (fourier_ggsw, standard_ggsw) in
+        for (mut fourier_ggsw, standard_ggsw) in
            izip!(self.as_mut_view().into_ggsw_iter(), coef_bsk.iter())
        {
-            fourier_ggsw.fill_with_forward_fourier(standard_ggsw, fft, stack.rb_mut());
+            convert_standard_ggsw_ciphertext_to_fourier_mem_optimized(
+                &standard_ggsw,
+                &mut fourier_ggsw,
+                fft,
+                stack.rb_mut(),
+            );
        }
    }
    /// Fill a bootstrapping key with the Fourier transform of a bootstrapping key in the standard
@@ -198,7 +203,7 @@ impl<'a> FourierLweBootstrapKeyMutView<'a> {
        coef_bsk: LweBootstrapKey<&'_ [Scalar]>,
        fft: FftView<'_>,
    ) {
-        let polynomial_size = self.fourier.polynomial_size;
+        let polynomial_size = self.polynomial_size();
        par_convert_polynomials_list_to_fourier(
            self.data(),
            coef_bsk.into_container(),
@@ -208,35 +213,6 @@ impl<'a> FourierLweBootstrapKeyMutView<'a> {
    }
 }

-/// Return the required memory for [`FourierLweBootstrapKeyView::blind_rotate_assign`].
-pub fn blind_rotate_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    StackReq::try_any_of([
-        // tmp_poly allocation
-        StackReq::try_new_aligned::<Scalar>(polynomial_size.0, CACHELINE_ALIGN)?,
-        StackReq::try_all_of([
-            // ct1 allocation
-            StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, CACHELINE_ALIGN)?,
-            // external product
-            add_external_product_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)?,
-        ])?,
-    ])
-}
-
-/// Return the required memory for [`FourierLweBootstrapKeyView::bootstrap`].
-pub fn bootstrap_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    blind_rotate_scratch::<Scalar>(glwe_size, polynomial_size, fft)?.try_and(
-        StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, CACHELINE_ALIGN)?,
-    )
-}
-
 impl<'a> FourierLweBootstrapKeyView<'a> {
    // CastInto required for PBS modulus switch which returns a usize
    pub fn blind_rotate_assign<Scalar: UnsignedTorus + CastInto<usize>>(
@@ -303,12 +279,11 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
                    );
                }

-                // as_mut_view is required to keep borrow rules consistent
                // second step of cmux
-                add_external_product_assign(
-                    ct0.as_mut_view(),
-                    bootstrap_key_ggsw,
-                    ct1.as_mut_view(),
+                add_external_product_assign_mem_optimized(
+                    &mut ct0,
+                    &bootstrap_key_ggsw,
+                    &ct1,
                    fft,
                    stack.rb_mut(),
                );
@@ -363,75 +338,3 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
        );
    }
 }
-
-impl<Scalar> FourierBootstrapKey<Scalar> for FourierLweBootstrapKeyOwned
-where
-    Scalar: UnsignedTorus + CastInto<usize>,
-{
-    type Fft = Fft;
-
-    fn new_fft(polynomial_size: PolynomialSize) -> Self::Fft {
-        Fft::new(polynomial_size)
-    }
-
-    fn new(
-        input_lwe_dimension: LweDimension,
-        polynomial_size: PolynomialSize,
-        glwe_size: GlweSize,
-        decomposition_base_log: DecompositionBaseLog,
-        decomposition_level_count: DecompositionLevelCount,
-    ) -> Self {
-        Self::new(
-            input_lwe_dimension,
-            glwe_size,
-            polynomial_size,
-            decomposition_base_log,
-            decomposition_level_count,
-        )
-    }
-
-    fn fill_with_forward_fourier_scratch(fft: &Self::Fft) -> Result<StackReq, SizeOverflow> {
-        fill_with_forward_fourier_scratch(fft.as_view())
-    }
-
-    fn fill_with_forward_fourier<ContBsk>(
-        &mut self,
-        coef_bsk: &LweBootstrapKey<ContBsk>,
-        fft: &Self::Fft,
-        stack: PodStack<'_>,
-    ) where
-        ContBsk: Container<Element = Scalar>,
-    {
-        self.as_mut_view()
-            .fill_with_forward_fourier(coef_bsk.as_view(), fft.as_view(), stack);
-    }
-
-    fn bootstrap_scratch(
-        glwe_size: GlweSize,
-        polynomial_size: PolynomialSize,
-        fft: &Self::Fft,
-    ) -> Result<StackReq, SizeOverflow> {
-        bootstrap_scratch::<Scalar>(glwe_size, polynomial_size, fft.as_view())
-    }
-
-    fn bootstrap<ContLweOut, ContLweIn, ContAcc>(
-        &self,
-        lwe_out: &mut LweCiphertext<ContLweOut>,
-        lwe_in: &LweCiphertext<ContLweIn>,
-        accumulator: &GlweCiphertext<ContAcc>,
-        fft: &Self::Fft,
-        stack: PodStack<'_>,
-    ) where
-        ContLweOut: ContainerMut<Element = Scalar>,
-        ContLweIn: Container<Element = Scalar>,
-        ContAcc: Container<Element = Scalar>,
-    {
-        self.as_view().bootstrap(
-            lwe_out.as_mut_view(),
-            lwe_in.as_view(),
-            accumulator.as_view(),
-            fft.as_view(),
-            stack,
-        )
-    }
-}
--- a/tfhe/src/core_crypto/entities/fourier_polynomial.rs
+++ b/tfhe/src/core_crypto/entities/fourier_polynomial.rs
@@ -0,0 +1,99 @@
+use crate::core_crypto::commons::parameters::*;
+use crate::core_crypto::commons::traits::*;
+use aligned_vec::{avec, ABox};
+use concrete_fft::c64;
+
+//--------------------------------------------------------------------------------
+// Structure definitions
+//--------------------------------------------------------------------------------
+
+/// Polynomial in the Fourier domain.
+///
+/// # Note
+///
+/// Polynomials in the Fourier domain have half the size of the corresponding polynomials in
+/// the standard domain.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct FourierPolynomial<C: Container> {
+    data: C,
+}
+
+pub type FourierPolynomialView<'a> = FourierPolynomial<&'a [c64]>;
+pub type FourierPolynomialMutView<'a> = FourierPolynomial<&'a mut [c64]>;
+pub type FourierPolynomialOwned = FourierPolynomial<ABox<[c64]>>;
+
+impl FourierPolynomialOwned {
+    pub fn new(polynomial_size: PolynomialSize) -> FourierPolynomial<ABox<[c64]>> {
+        let boxed = avec![
+            c64::default();
+            polynomial_size.to_fourier_polynomial_size().0
+        ]
+        .into_boxed_slice();
+
+        FourierPolynomial { data: boxed }
+    }
+}
+
+impl<T, C: Container<Element = T>> AsRef<[T]> for FourierPolynomial<C> {
+    fn as_ref(&self) -> &[T] {
+        self.data.as_ref()
+    }
+}
+
+impl<T, C: ContainerMut<Element = T>> AsMut<[T]> for FourierPolynomial<C> {
+    fn as_mut(&mut self) -> &mut [T] {
+        self.data.as_mut()
+    }
+}
+
+impl<C: Container<Element = c64>> FourierPolynomial<C> {
+    /// Create a [`FourierPolynomial`] from an existing container.
+    ///
+    /// # Note
+    ///
+    /// This function wraps a container in the appropriate type; an empty container panics.
+    pub fn from_container(container: C) -> FourierPolynomial<C> {
+        assert!(
+            container.container_len() > 0,
+            "Got an empty container to create a Polynomial"
+        );
+        FourierPolynomial { data: container }
+    }
+
+    /// Return the [`PolynomialSize`] of the [`FourierPolynomial`].
+    pub fn polynomial_size(&self) -> PolynomialSize {
+        FourierPolynomialSize(self.data.container_len()).to_standard_polynomial_size()
+    }
+
+    /// Consume the entity and return its underlying container.
+    pub fn into_container(self) -> C {
+        self.data
+    }
+
+    /// Return a view of the [`FourierPolynomial`]. This is useful if an algorithm takes a view by
+    /// value.
+    pub fn as_view(&self) -> FourierPolynomialView<'_> {
+        FourierPolynomialView::from_container(self.as_ref())
+    }
+}
+
+impl<C: ContainerMut<Element = c64>> FourierPolynomial<C> {
+    /// Return a mutable view of the [`FourierPolynomial`]. This is useful if an algorithm takes a
+    /// mutable view by value.
+    pub fn as_mut_view(&mut self) -> FourierPolynomialMutView<'_> {
+        FourierPolynomialMutView::from_container(self.as_mut())
+    }
+}
+
+/// Metadata used in the [`CreateFrom`] implementation to create [`FourierPolynomial`] entities.
+#[derive(Clone, Copy)]
+pub struct FourierPolynomialCreationMetadata();
+
+impl<C: Container<Element = c64>> CreateFrom<C> for FourierPolynomial<C> {
+    type Metadata = FourierPolynomialCreationMetadata;
+
+    #[inline]
+    fn create_from(from: C, _: Self::Metadata) -> FourierPolynomial<C> {
+        FourierPolynomial::from_container(from)
+    }
+}
--- a/tfhe/src/core_crypto/entities/fourier_polynomial_list.rs
+++ b/tfhe/src/core_crypto/entities/fourier_polynomial_list.rs
@@ -0,0 +1,27 @@
+use crate::core_crypto::commons::parameters::{PolynomialCount, PolynomialSize};
+use crate::core_crypto::commons::traits::Container;
+use crate::core_crypto::entities::*;
+use crate::core_crypto::prelude::Split;
+use concrete_fft::c64;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct FourierPolynomialList<C: Container> {
+    pub data: C,
+    pub polynomial_size: PolynomialSize,
+}
+
+impl<C: Container> FourierPolynomialList<C> {
+    pub fn polynomial_count(&self) -> PolynomialCount {
+        PolynomialCount(
+            self.data.container_len() / self.polynomial_size.to_fourier_polynomial_size().0,
+        )
+    }
+}
+
+impl<C: Container<Element = c64> + Split> FourierPolynomialList<C> {
+    pub fn into_polynomial_iter(self) -> impl DoubleEndedIterator<Item = FourierPolynomial<C>> {
+        self.data
+            .into_chunks(self.polynomial_size.to_fourier_polynomial_size().0)
+            .map(FourierPolynomial::from_container)
+    }
+}
--- a/tfhe/src/core_crypto/entities/lwe_multi_bit_bootstrap_key.rs
+++ b/tfhe/src/core_crypto/entities/lwe_multi_bit_bootstrap_key.rs
@@ -3,7 +3,6 @@
 use crate::core_crypto::commons::parameters::*;
 use crate::core_crypto::commons::traits::*;
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::math::fft::FourierPolynomialList;
 use aligned_vec::{avec, ABox};
 use concrete_fft::c64;

--- a/tfhe/src/core_crypto/entities/mod.rs
+++ b/tfhe/src/core_crypto/entities/mod.rs
@@ -4,6 +4,10 @@
 //! associated to the object, e.g., `get_mask` for the entity `LweCiphertext`.

 pub mod cleartext;
+pub mod fourier_ggsw_ciphertext;
+pub mod fourier_lwe_bootstrap_key;
+pub mod fourier_polynomial;
+pub mod fourier_polynomial_list;
 pub mod ggsw_ciphertext;
 pub mod ggsw_ciphertext_list;
 pub mod glwe_ciphertext;
@@ -45,14 +49,11 @@ pub use crate::core_crypto::fft_impl::fft128::crypto::bootstrap::{
 pub use crate::core_crypto::fft_impl::fft128::crypto::ggsw::{
    Fourier128GgswCiphertext, Fourier128GgswLevelMatrix, Fourier128GgswLevelRow,
 };
-pub use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::{
-    FourierLweBootstrapKey, FourierLweBootstrapKeyOwned,
-};
-pub use crate::core_crypto::fft_impl::fft64::crypto::ggsw::{
-    FourierGgswCiphertext, FourierGgswCiphertextList, FourierGgswLevelMatrix, FourierGgswLevelRow,
-};
-pub use crate::core_crypto::fft_impl::fft64::math::polynomial::FourierPolynomial;
 pub use cleartext::*;
+pub use fourier_ggsw_ciphertext::*;
+pub use fourier_lwe_bootstrap_key::*;
+pub use fourier_polynomial::*;
+pub use fourier_polynomial_list::*;
 pub use ggsw_ciphertext::*;
 pub use ggsw_ciphertext_list::*;
 pub use glwe_ciphertext::*;
--- a/tfhe/src/core_crypto/fft_impl/fft128/crypto/ggsw.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft128/crypto/ggsw.rs
@@ -1,5 +1,7 @@
 use super::super::math::fft::Fft128View;
-use crate::core_crypto::commons::math::decomposition::{DecompositionLevel, SignedDecomposer};
+use crate::core_crypto::commons::math::decomposition::{
+    DecompositionLevel, SignedDecomposer, TensorSignedDecompositionLendingIter,
+};
 use crate::core_crypto::commons::math::torus::UnsignedTorus;
 use crate::core_crypto::commons::parameters::{
    DecompositionBaseLog, DecompositionLevelCount, GlweSize, PolynomialSize,
@@ -9,7 +11,6 @@ use crate::core_crypto::commons::traits::{
 };
 use crate::core_crypto::commons::utils::izip;
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::math::decomposition::TensorSignedDecompositionLendingIter;
 use crate::core_crypto::prelude::ContainerMut;
 use aligned_vec::CACHELINE_ALIGN;
 use concrete_fft::fft128::f128;
@@ -412,21 +413,30 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
            // ------------------------------------------------------ EXTERNAL PRODUCT IN FOURIER
            // DOMAIN In this section, we perform the external product in the fourier
            // domain, and accumulate the result in the output_fft_buffer variable.
-            let (mut decomposition, mut substack1) = TensorSignedDecompositionLendingIter::new(
+            let glwe_len = glwe.as_ref().len();
+            let (mut states, mut substack1) = substack0
+                .rb_mut()
+                .make_aligned_raw::<Scalar>(glwe_len, CACHELINE_ALIGN);
+            let mut decomposition = TensorSignedDecompositionLendingIter::new(
                glwe.as_ref()
                    .iter()
                    .map(|s| decomposer.closest_representable(*s)),
                DecompositionBaseLog(decomposer.base_log),
                DecompositionLevelCount(decomposer.level_count),
-                substack0.rb_mut(),
+                &mut states,
            );

            // We loop through the levels (we reverse to match the order of the decomposition
            // iterator.)
            for ggsw_decomp_matrix in ggsw.into_levels().rev() {
                // We retrieve the decomposition of this level.
-                let (glwe_level, glwe_decomp_term, mut substack2) =
-                    collect_next_term(&mut decomposition, &mut substack1, align);
+                let (mut glwe_decomp_term, mut substack2) = substack1
+                    .rb_mut()
+                    .make_aligned_raw::<Scalar>(glwe_len, CACHELINE_ALIGN);
+                let glwe_level = decomposition
+                    .fill_next_term(&mut glwe_decomp_term)
+                    .unwrap()
+                    .0;
                let glwe_decomp_term = GlweCiphertextView::from_container(
                    &*glwe_decomp_term,
                    ggsw.polynomial_size(),
@@ -525,20 +535,6 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
    )
 }

-fn collect_next_term<'a, Scalar: UnsignedTorus>(
-    decomposition: &mut TensorSignedDecompositionLendingIter<'_, Scalar>,
-    substack1: &'a mut PodStack,
-    align: usize,
-) -> (
-    DecompositionLevel,
-    dyn_stack::DynArray<'a, Scalar>,
-    PodStack<'a>,
-) {
-    let (glwe_level, _, glwe_decomp_term) = decomposition.next_term().unwrap();
-    let (glwe_decomp_term, substack2) = substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
-    (glwe_level, glwe_decomp_term, substack2)
-}
-
 /// # Note
 ///
 /// this function leaves all the elements of `output_fourier` in an initialized state.
--- a/tfhe/src/core_crypto/fft_impl/fft64/crypto/ggsw.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/crypto/ggsw.rs
@@ -1,778 +0,0 @@
-use super::super::math::decomposition::TensorSignedDecompositionLendingIter;
-use super::super::math::fft::{FftView, FourierPolynomialList};
-use super::super::math::polynomial::FourierPolynomialMutView;
-use crate::core_crypto::commons::math::decomposition::{DecompositionLevel, SignedDecomposer};
-use crate::core_crypto::commons::math::torus::UnsignedTorus;
-use crate::core_crypto::commons::parameters::{
-    DecompositionBaseLog, DecompositionLevelCount, GlweSize, PolynomialSize,
-};
-use crate::core_crypto::commons::traits::{
-    Container, ContiguousEntityContainer, ContiguousEntityContainerMut, IntoContainerOwned, Split,
-};
-use crate::core_crypto::commons::utils::izip;
-use crate::core_crypto::entities::*;
-use aligned_vec::{avec, ABox, CACHELINE_ALIGN};
-use concrete_fft::c64;
-use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};
-
-/// A GGSW ciphertext in the Fourier domain.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(bound(deserialize = "C: IntoContainerOwned"))]
-pub struct FourierGgswCiphertext<C: Container<Element = c64>> {
-    fourier: FourierPolynomialList<C>,
-    glwe_size: GlweSize,
-    decomposition_base_log: DecompositionBaseLog,
-    decomposition_level_count: DecompositionLevelCount,
-}
-
-/// A matrix containing a single level of gadget decomposition, in the Fourier domain.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub struct FourierGgswLevelMatrix<C: Container<Element = c64>> {
-    data: C,
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    row_count: usize,
-    decomposition_level: DecompositionLevel,
-}
-
-/// A row of a GGSW level matrix, in the Fourier domain.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub struct FourierGgswLevelRow<C: Container<Element = c64>> {
-    data: C,
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    decomposition_level: DecompositionLevel,
-}
-
-pub type FourierGgswCiphertextView<'a> = FourierGgswCiphertext<&'a [c64]>;
-pub type FourierGgswCiphertextMutView<'a> = FourierGgswCiphertext<&'a mut [c64]>;
-pub type FourierGgswLevelMatrixView<'a> = FourierGgswLevelMatrix<&'a [c64]>;
-pub type FourierGgswLevelMatrixMutView<'a> = FourierGgswLevelMatrix<&'a mut [c64]>;
-pub type FourierGgswLevelRowView<'a> = FourierGgswLevelRow<&'a [c64]>;
-pub type FourierGgswLevelRowMutView<'a> = FourierGgswLevelRow<&'a mut [c64]>;
-
-impl<C: Container<Element = c64>> FourierGgswCiphertext<C> {
-    pub fn from_container(
-        data: C,
-        glwe_size: GlweSize,
-        polynomial_size: PolynomialSize,
-        decomposition_base_log: DecompositionBaseLog,
-        decomposition_level_count: DecompositionLevelCount,
-    ) -> Self {
-        assert_eq!(
-            data.container_len(),
-            polynomial_size.to_fourier_polynomial_size().0
-                * glwe_size.0
-                * glwe_size.0
-                * decomposition_level_count.0
-        );
-
-        Self {
-            fourier: FourierPolynomialList {
-                data,
-                polynomial_size,
-            },
-            glwe_size,
-            decomposition_base_log,
-            decomposition_level_count,
-        }
-    }
-
-    pub fn polynomial_size(&self) -> PolynomialSize {
-        self.fourier.polynomial_size
-    }
-
-    pub fn glwe_size(&self) -> GlweSize {
-        self.glwe_size
-    }
-
-    pub fn decomposition_base_log(&self) -> DecompositionBaseLog {
-        self.decomposition_base_log
-    }
-
-    pub fn decomposition_level_count(&self) -> DecompositionLevelCount {
-        self.decomposition_level_count
-    }
-
-    pub fn data(self) -> C {
-        self.fourier.data
-    }
-
-    pub fn as_view(&self) -> FourierGgswCiphertextView<'_>
-    where
-        C: AsRef<[c64]>,
-    {
-        FourierGgswCiphertextView {
-            fourier: FourierPolynomialList {
-                data: self.fourier.data.as_ref(),
-                polynomial_size: self.fourier.polynomial_size,
-            },
-            glwe_size: self.glwe_size,
-            decomposition_base_log: self.decomposition_base_log,
-            decomposition_level_count: self.decomposition_level_count,
-        }
-    }
-
-    pub fn as_mut_view(&mut self) -> FourierGgswCiphertextMutView<'_>
-    where
-        C: AsMut<[c64]>,
-    {
-        FourierGgswCiphertextMutView {
-            fourier: FourierPolynomialList {
-                data: self.fourier.data.as_mut(),
-                polynomial_size: self.fourier.polynomial_size,
-            },
-            glwe_size: self.glwe_size,
-            decomposition_base_log: self.decomposition_base_log,
-            decomposition_level_count: self.decomposition_level_count,
-        }
-    }
-}
-
-impl<C: Container<Element = c64>> FourierGgswLevelMatrix<C> {
-    pub fn new(
-        data: C,
-        glwe_size: GlweSize,
-        polynomial_size: PolynomialSize,
-        row_count: usize,
-        decomposition_level: DecompositionLevel,
-    ) -> Self {
-        assert_eq!(
-            data.container_len(),
-            polynomial_size.to_fourier_polynomial_size().0 * glwe_size.0 * row_count
-        );
-        Self {
-            data,
-            polynomial_size,
-            glwe_size,
-            row_count,
-            decomposition_level,
-        }
-    }
-
-    /// Return an iterator over the rows of the level matrices.
-    pub fn into_rows(self) -> impl DoubleEndedIterator<Item = FourierGgswLevelRow<C>>
-    where
-        C: Split,
-    {
-        self.data
-            .split_into(self.row_count)
-            .map(move |slice| FourierGgswLevelRow {
-                data: slice,
-                polynomial_size: self.polynomial_size,
-                glwe_size: self.glwe_size,
-                decomposition_level: self.decomposition_level,
-            })
-    }
-
-    pub fn polynomial_size(&self) -> PolynomialSize {
-        self.polynomial_size
-    }
-
-    pub fn glwe_size(&self) -> GlweSize {
-        self.glwe_size
-    }
-
-    pub fn row_count(&self) -> usize {
-        self.row_count
-    }
-
-    pub fn decomposition_level(&self) -> DecompositionLevel {
-        self.decomposition_level
-    }
-
-    pub fn data(self) -> C {
-        self.data
-    }
-}
-
-impl<C: Container<Element = c64>> FourierGgswLevelRow<C> {
-    pub fn new(
-        data: C,
-        glwe_size: GlweSize,
-        polynomial_size: PolynomialSize,
-        decomposition_level: DecompositionLevel,
-    ) -> Self {
-        assert_eq!(
-            data.container_len(),
-            polynomial_size.to_fourier_polynomial_size().0 * glwe_size.0
-        );
-        Self {
-            data,
-            polynomial_size,
-            glwe_size,
-            decomposition_level,
-        }
-    }
-
-    pub fn polynomial_size(&self) -> PolynomialSize {
-        self.polynomial_size
-    }
-
-    pub fn glwe_size(&self) -> GlweSize {
-        self.glwe_size
-    }
-
-    pub fn decomposition_level(&self) -> DecompositionLevel {
-        self.decomposition_level
-    }
-
-    pub fn data(self) -> C {
-        self.data
-    }
-}
-
-impl<'a> FourierGgswCiphertextView<'a> {
-    /// Return an iterator over the level matrices.
-    pub fn into_levels(self) -> impl DoubleEndedIterator<Item = FourierGgswLevelMatrixView<'a>> {
-        self.fourier
-            .data
-            .split_into(self.decomposition_level_count.0)
-            .enumerate()
-            .map(move |(i, slice)| {
-                FourierGgswLevelMatrixView::new(
-                    slice,
-                    self.glwe_size,
-                    self.fourier.polynomial_size,
-                    self.glwe_size.0,
-                    DecompositionLevel(i + 1),
-                )
-            })
-    }
-}
-
-/// Return the required memory for [`FourierGgswCiphertextMutView::fill_with_forward_fourier`].
-pub fn fill_with_forward_fourier_scratch(fft: FftView<'_>) -> Result<StackReq, SizeOverflow> {
-    fft.forward_scratch()
-}
-
-impl<'a> FourierGgswCiphertextMutView<'a> {
-    /// Fill a GGSW ciphertext with the Fourier transform of a GGSW ciphertext in the standard
-    /// domain.
-    pub fn fill_with_forward_fourier<Scalar: UnsignedTorus>(
-        self,
-        coef_ggsw: GgswCiphertextView<'_, Scalar>,
-        fft: FftView<'_>,
-        mut stack: PodStack<'_>,
-    ) {
-        debug_assert_eq!(coef_ggsw.polynomial_size(), self.polynomial_size());
-        let fourier_poly_size = coef_ggsw.polynomial_size().to_fourier_polynomial_size().0;
-
-        for (fourier_poly, coef_poly) in izip!(
-            self.data().into_chunks(fourier_poly_size),
-            coef_ggsw.as_polynomial_list().iter()
-        ) {
-            fft.forward_as_torus(
-                FourierPolynomialMutView { data: fourier_poly },
-                coef_poly,
-                stack.rb_mut(),
-            );
-        }
-    }
-}
-
-#[allow(unused)]
-type FourierGgswCiphertextOwned = FourierGgswCiphertext<ABox<[c64]>>;
-
-impl FourierGgswCiphertext<ABox<[c64]>> {
-    pub fn new(
-        glwe_size: GlweSize,
-        polynomial_size: PolynomialSize,
-        decomposition_base_log: DecompositionBaseLog,
-        decomposition_level_count: DecompositionLevelCount,
-    ) -> FourierGgswCiphertext<ABox<[c64]>> {
-        let boxed = avec![
-            c64::default();
-            polynomial_size.to_fourier_polynomial_size().0
-                * glwe_size.0
-                * glwe_size.0
-                * decomposition_level_count.0
-        ]
-        .into_boxed_slice();
-
-        FourierGgswCiphertext::from_container(
-            boxed,
-            glwe_size,
-            polynomial_size,
-            decomposition_base_log,
-            decomposition_level_count,
-        )
-    }
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy)]
-pub struct FourierGgswCiphertextList<C: Container<Element = c64>> {
-    fourier: FourierPolynomialList<C>,
-    glwe_size: GlweSize,
-    decomposition_level_count: DecompositionLevelCount,
-    decomposition_base_log: DecompositionBaseLog,
-    count: usize,
-}
-
-pub type FourierGgswCiphertextListView<'a> = FourierGgswCiphertextList<&'a [c64]>;
-pub type FourierGgswCiphertextListMutView<'a> = FourierGgswCiphertextList<&'a mut [c64]>;
-
-impl<C: Container<Element = c64>> FourierGgswCiphertextList<C> {
-    pub fn new(
-        data: C,
-        count: usize,
-        glwe_size: GlweSize,
-        polynomial_size: PolynomialSize,
-        decomposition_base_log: DecompositionBaseLog,
-        decomposition_level_count: DecompositionLevelCount,
-    ) -> Self {
-        assert_eq!(
-            data.container_len(),
-            count
-                * polynomial_size.to_fourier_polynomial_size().0
-                * glwe_size.0
-                * glwe_size.0
-                * decomposition_level_count.0
-        );
-
-        Self {
-            fourier: FourierPolynomialList {
-                data,
-                polynomial_size,
-            },
-            count,
-            glwe_size,
-            decomposition_level_count,
-            decomposition_base_log,
-        }
-    }
-
-    pub fn data(self) -> C {
-        self.fourier.data
-    }
-
-    pub fn polynomial_size(&self) -> PolynomialSize {
-        self.fourier.polynomial_size
-    }
-
-    pub fn count(&self) -> usize {
-        self.count
-    }
-
-    pub fn glwe_size(&self) -> GlweSize {
-        self.glwe_size
-    }
-
-    pub fn decomposition_level_count(&self) -> DecompositionLevelCount {
-        self.decomposition_level_count
-    }
-
-    pub fn decomposition_base_log(&self) -> DecompositionBaseLog {
-        self.decomposition_base_log
-    }
-
-    pub fn as_view(&self) -> FourierGgswCiphertextListView<'_> {
-        let fourier = FourierPolynomialList {
-            data: self.fourier.data.as_ref(),
-            polynomial_size: self.fourier.polynomial_size,
-        };
-        FourierGgswCiphertextListView {
-            fourier,
-            count: self.count,
-            glwe_size: self.glwe_size,
-            decomposition_level_count: self.decomposition_level_count,
-            decomposition_base_log: self.decomposition_base_log,
-        }
-    }
-
-    pub fn as_mut_view(&mut self) -> FourierGgswCiphertextListMutView<'_>
-    where
-        C: AsMut<[c64]>,
-    {
-        let fourier = FourierPolynomialList {
-            data: self.fourier.data.as_mut(),
-            polynomial_size: self.fourier.polynomial_size,
-        };
-        FourierGgswCiphertextListMutView {
-            fourier,
-            count: self.count,
-            glwe_size: self.glwe_size,
-            decomposition_level_count: self.decomposition_level_count,
-            decomposition_base_log: self.decomposition_base_log,
-        }
-    }
-
-    pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
-    where
-        C: Split,
-    {
-        self.fourier.data.split_into(self.count).map(move |slice| {
-            FourierGgswCiphertext::from_container(
-                slice,
-                self.glwe_size,
-                self.fourier.polynomial_size,
-                self.decomposition_base_log,
-                self.decomposition_level_count,
-            )
-        })
-    }
-
-    pub fn split_at(self, mid: usize) -> (Self, Self)
-    where
-        C: Split,
-    {
-        let polynomial_size = self.fourier.polynomial_size;
-        let glwe_size = self.glwe_size;
-        let decomposition_level_count = self.decomposition_level_count;
-        let decomposition_base_log = self.decomposition_base_log;
-
-        let (left, right) = self.fourier.data.split_at(
-            mid * polynomial_size.to_fourier_polynomial_size().0
-                * glwe_size.0
-                * glwe_size.0
-                * decomposition_level_count.0,
-        );
-        (
-            Self::new(
-                left,
-                mid,
-                glwe_size,
-                polynomial_size,
-                decomposition_base_log,
-                decomposition_level_count,
-            ),
-            Self::new(
-                right,
-                self.count - mid,
-                glwe_size,
-                polynomial_size,
-                decomposition_base_log,
-                decomposition_level_count,
-            ),
-        )
-    }
-}
-
-/// Return the required memory for [`add_external_product_assign`].
-pub fn add_external_product_assign_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    let align = CACHELINE_ALIGN;
-    let standard_scratch =
-        StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, align)?;
-    let fourier_polynomial_size = polynomial_size.to_fourier_polynomial_size().0;
-    let fourier_scratch =
-        StackReq::try_new_aligned::<c64>(glwe_size.0 * fourier_polynomial_size, align)?;
-    let fourier_scratch_single = StackReq::try_new_aligned::<c64>(fourier_polynomial_size, align)?;
-
-    let substack3 = fft.forward_scratch()?;
-    let substack2 = substack3.try_and(fourier_scratch_single)?;
-    let substack1 = substack2.try_and(standard_scratch)?;
-    let substack0 = StackReq::try_any_of([
-        substack1.try_and(standard_scratch)?,
-        fft.backward_scratch()?,
-    ])?;
-    substack0.try_and(fourier_scratch)
-}
-
-/// Perform the external product of `ggsw` and `glwe`, and adds the result to `out`.
-#[cfg_attr(__profiling, inline(never))]
-pub fn add_external_product_assign<Scalar, InputGlweCont>(
-    mut out: GlweCiphertextMutView<'_, Scalar>,
-    ggsw: FourierGgswCiphertextView<'_>,
-    glwe: GlweCiphertext<InputGlweCont>,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) where
-    Scalar: UnsignedTorus,
-    InputGlweCont: Container<Element = Scalar>,
-{
-    // we check that the polynomial sizes match
-    debug_assert_eq!(ggsw.polynomial_size(), glwe.polynomial_size());
-    debug_assert_eq!(ggsw.polynomial_size(), out.polynomial_size());
-    // we check that the glwe sizes match
-    debug_assert_eq!(ggsw.glwe_size(), glwe.glwe_size());
-    debug_assert_eq!(ggsw.glwe_size(), out.glwe_size());
-
-    let align = CACHELINE_ALIGN;
-    let fourier_poly_size = ggsw.polynomial_size().to_fourier_polynomial_size().0;
-
-    // we round the input mask and body
-    let decomposer = SignedDecomposer::<Scalar>::new(
-        ggsw.decomposition_base_log(),
-        ggsw.decomposition_level_count(),
-    );
-
-    let (mut output_fft_buffer, mut substack0) =
-        stack.make_aligned_raw::<c64>(fourier_poly_size * ggsw.glwe_size().0, align);
-    // output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
-    // the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
-    // it has been fully initialized for the first time.
-    let output_fft_buffer = &mut *output_fft_buffer;
-    let mut is_output_uninit = true;
-
-    {
-        // ------------------------------------------------------ EXTERNAL PRODUCT IN FOURIER DOMAIN
-        // In this section, we perform the external product in the fourier domain, and accumulate
-        // the result in the output_fft_buffer variable.
-        let (mut decomposition, mut substack1) = TensorSignedDecompositionLendingIter::new(
-            glwe.as_ref()
-                .iter()
-                .map(|s| decomposer.closest_representable(*s)),
-            DecompositionBaseLog(decomposer.base_log),
-            DecompositionLevelCount(decomposer.level_count),
-            substack0.rb_mut(),
-        );
-
-        // We loop through the levels (we reverse to match the order of the decomposition iterator.)
-        ggsw.into_levels().rev().for_each(|ggsw_decomp_matrix| {
-            // We retrieve the decomposition of this level.
-            let (glwe_level, glwe_decomp_term, mut substack2) =
-                collect_next_term(&mut decomposition, &mut substack1, align);
-            let glwe_decomp_term = GlweCiphertextView::from_container(
-                &*glwe_decomp_term,
-                ggsw.polynomial_size(),
-                out.ciphertext_modulus(),
-            );
-            debug_assert_eq!(ggsw_decomp_matrix.decomposition_level(), glwe_level);
-
-            // For each level we have to add the result of the vector-matrix product between the
-            // decomposition of the glwe, and the ggsw level matrix to the output. To do so, we
-            // iteratively add to the output, the product between every line of the matrix, and
-            // the corresponding (scalar) polynomial in the glwe decomposition:
-            //
-            //                ggsw_mat                        ggsw_mat
-            //   glwe_dec   | - - - - | <        glwe_dec   | - - - - |
-            //  | - - - | x | - - - - |         | - - - | x | - - - - | <
-            //    ^         | - - - - |             ^       | - - - - |
-            //
-            //        t = 1                           t = 2                     ...
-
-            izip!(
-                ggsw_decomp_matrix.into_rows(),
-                glwe_decomp_term.as_polynomial_list().iter()
-            )
-            .for_each(|(ggsw_row, glwe_poly)| {
-                let (mut fourier, substack3) = substack2
-                    .rb_mut()
-                    .make_aligned_raw::<c64>(fourier_poly_size, align);
-                // We perform the forward fft transform for the glwe polynomial
-                let fourier = fft
-                    .forward_as_integer(
-                        FourierPolynomialMutView { data: &mut fourier },
-                        glwe_poly,
-                        substack3,
-                    )
-                    .data;
-                // Now we loop through the polynomials of the output, and add the
-                // corresponding product of polynomials.
-
-                update_with_fmadd(
-                    output_fft_buffer,
-                    ggsw_row.data(),
-                    fourier,
-                    is_output_uninit,
-                    fourier_poly_size,
-                );
-
-                // we initialized `output_fft_buffer, so we can set this to false
-                is_output_uninit = false;
-            });
-        });
-    }
-
-    // --------------------------------------------  TRANSFORMATION OF RESULT TO STANDARD DOMAIN
-    // In this section, we bring the result from the fourier domain, back to the standard
-    // domain, and add it to the output.
-    //
-    // We iterate over the polynomials in the output.
-    if !is_output_uninit {
-        izip!(
-            out.as_mut_polynomial_list().iter_mut(),
-            output_fft_buffer
-                .into_chunks(fourier_poly_size)
-                .map(|slice| FourierPolynomialMutView { data: slice }),
-        )
-        .for_each(|(out, fourier)| {
-            // The fourier buffer is not re-used afterwards so we can use the in-place version of
-            // the add_backward_as_torus function
-            fft.add_backward_in_place_as_torus(out, fourier, substack0.rb_mut());
-        });
-    }
-}
-
-#[cfg_attr(__profiling, inline(never))]
-fn collect_next_term<'a, Scalar: UnsignedTorus>(
-    decomposition: &mut TensorSignedDecompositionLendingIter<'_, Scalar>,
-    substack1: &'a mut PodStack,
-    align: usize,
-) -> (
-    DecompositionLevel,
-    dyn_stack::DynArray<'a, Scalar>,
-    PodStack<'a>,
-) {
-    let (glwe_level, _, glwe_decomp_term) = decomposition.next_term().unwrap();
-    let (glwe_decomp_term, substack2) = substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
-    (glwe_level, glwe_decomp_term, substack2)
-}
-
-#[cfg_attr(__profiling, inline(never))]
-pub(crate) fn update_with_fmadd(
-    output_fft_buffer: &mut [c64],
-    lhs_polynomial_list: &[c64],
-    fourier: &[c64],
-    is_output_uninit: bool,
-    fourier_poly_size: usize,
-) {
-    struct Impl<'a> {
-        output_fft_buffer: &'a mut [c64],
-        lhs_polynomial_list: &'a [c64],
-        fourier: &'a [c64],
-        is_output_uninit: bool,
-        fourier_poly_size: usize,
-    }
-
-    impl pulp::WithSimd for Impl<'_> {
-        type Output = ();
-
-        #[inline(always)]
-        fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
-            // Introducing a function boundary here means that the slices
-            // get `noalias` markers, possibly allowing better optimizations from LLVM.
-            //
-            // see:
-            // https://github.com/rust-lang/rust/blob/56e1aaadb31542b32953292001be2312810e88fd/library/core/src/slice/mod.rs#L960-L966
-            #[inline(always)]
-            fn implementation<S: pulp::Simd>(
-                simd: S,
-                output_fft_buffer: &mut [c64],
-                lhs_polynomial_list: &[c64],
-                fourier: &[c64],
-                is_output_uninit: bool,
-                fourier_poly_size: usize,
-            ) {
-                let rhs = S::c64s_as_simd(fourier).0;
-
-                if is_output_uninit {
-                    for (output_fourier, ggsw_poly) in izip!(
-                        output_fft_buffer.into_chunks(fourier_poly_size),
-                        lhs_polynomial_list.into_chunks(fourier_poly_size)
-                    ) {
-                        let out = S::c64s_as_mut_simd(output_fourier).0;
-                        let lhs = S::c64s_as_simd(ggsw_poly).0;
-
-                        for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
-                            *out = simd.c64s_mul(lhs, rhs);
-                        }
-                    }
-                } else {
-                    for (output_fourier, ggsw_poly) in izip!(
-                        output_fft_buffer.into_chunks(fourier_poly_size),
-                        lhs_polynomial_list.into_chunks(fourier_poly_size)
-                    ) {
-                        let out = S::c64s_as_mut_simd(output_fourier).0;
-                        let lhs = S::c64s_as_simd(ggsw_poly).0;
-
-                        for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
-                            *out = simd.c64s_mul_adde(lhs, rhs, *out);
-                        }
-                    }
-                }
-            }
-
-            implementation(
-                simd,
-                self.output_fft_buffer,
-                self.lhs_polynomial_list,
-                self.fourier,
-                self.is_output_uninit,
-                self.fourier_poly_size,
-            )
-        }
-    }
-
-    pulp::Arch::new().dispatch(Impl {
-        output_fft_buffer,
-        lhs_polynomial_list,
-        fourier,
-        is_output_uninit,
-        fourier_poly_size,
-    })
-}
-
-pub(crate) fn update_with_fmadd_factor(
-    output_fft_buffer: &mut [c64],
-    lhs_polynomial_list: &[c64],
-    fourier: &[c64],
-    factor: c64,
-    is_output_uninit: bool,
-    fourier_poly_size: usize,
-) {
-    struct Impl<'a> {
-        output_fft_buffer: &'a mut [c64],
-        lhs_polynomial_list: &'a [c64],
-        fourier: &'a [c64],
-        factor: c64,
-        is_output_uninit: bool,
-        fourier_poly_size: usize,
-    }
-
-    impl pulp::WithSimd for Impl<'_> {
-        type Output = ();
-
-        #[inline(always)]
-        fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
-            let factor = simd.c64s_splat(self.factor);
-
-            for (output_fourier, ggsw_poly) in izip!(
-                self.output_fft_buffer.into_chunks(self.fourier_poly_size),
-                self.lhs_polynomial_list.into_chunks(self.fourier_poly_size)
-            ) {
-                let out = S::c64s_as_mut_simd(output_fourier).0;
-                let lhs = S::c64s_as_simd(ggsw_poly).0;
-                let rhs = S::c64s_as_simd(self.fourier).0;
-
-                if self.is_output_uninit {
-                    for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
-                        // NOTE: factor * (lhs * rhs) is more efficient than (lhs * rhs) * factor
-                        *out = simd.c64s_mul(factor, simd.c64s_mul(lhs, rhs));
-                    }
-                } else {
-                    for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
-                        // NOTE: see above
-                        *out = simd.c64s_mul_adde(factor, simd.c64s_mul(lhs, rhs), *out);
-                    }
-                }
-            }
-        }
-    }
-
-    pulp::Arch::new().dispatch(Impl {
-        output_fft_buffer,
-        lhs_polynomial_list,
-        fourier,
-        factor,
-        is_output_uninit,
-        fourier_poly_size,
-    })
-}
-
-/// Return the required memory for [`cmux`].
-pub fn cmux_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    add_external_product_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)
-}
-
-/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
-pub fn cmux<Scalar: UnsignedTorus>(
-    ct0: GlweCiphertextMutView<'_, Scalar>,
-    mut ct1: GlweCiphertextMutView<'_, Scalar>,
-    ggsw: FourierGgswCiphertextView<'_>,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    izip!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
-        *c1 = c1.wrapping_sub(*c0);
-    });
-    add_external_product_assign(ct0, ggsw, ct1, fft, stack);
-}
--- a/tfhe/src/core_crypto/fft_impl/fft64/crypto/mod.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/crypto/mod.rs
@@ -1,6 +0,0 @@
-pub mod bootstrap;
-pub mod ggsw;
-pub mod wop_pbs;
-
-#[cfg(test)]
-pub mod tests;
--- a/tfhe/src/core_crypto/fft_impl/fft64/crypto/tests.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/crypto/tests.rs
@@ -1,19 +0,0 @@
-use crate::core_crypto::fft_impl::common::tests::test_bootstrap_generic;
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::FourierLweBootstrapKeyOwned;
-use crate::core_crypto::prelude::*;
-
-#[test]
-fn test_bootstrap_u64() {
-    test_bootstrap_generic::<u64, FourierLweBootstrapKeyOwned>(
-        StandardDev(0.000007069849454709433),
-        StandardDev(0.00000000000000029403601535432533),
-    );
-}
-
-#[test]
-fn test_bootstrap_u32() {
-    test_bootstrap_generic::<u32, FourierLweBootstrapKeyOwned>(
-        StandardDev(0.000007069849454709433),
-        StandardDev(0.00000000000000029403601535432533),
-    );
-}
--- a/tfhe/src/core_crypto/fft_impl/fft64/crypto/wop_pbs/mod.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/crypto/wop_pbs/mod.rs
@@ -1,895 +0,0 @@
-#![allow(clippy::too_many_arguments)]
-
-use aligned_vec::CACHELINE_ALIGN;
-use dyn_stack::{PodStack, ReborrowMut, SizeOverflow, StackReq};
-
-use super::super::math::fft::FftView;
-use super::bootstrap::{bootstrap_scratch, FourierLweBootstrapKeyView};
-use super::ggsw::{
-    add_external_product_assign, add_external_product_assign_scratch, cmux, cmux_scratch,
-    fill_with_forward_fourier_scratch, FourierGgswCiphertextListMutView,
-    FourierGgswCiphertextListView,
-};
-use crate::core_crypto::algorithms::polynomial_algorithms::*;
-use crate::core_crypto::algorithms::*;
-use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
-use crate::core_crypto::commons::numeric::CastInto;
-use crate::core_crypto::commons::parameters::*;
-use crate::core_crypto::commons::traits::*;
-use crate::core_crypto::commons::utils::izip;
-use crate::core_crypto::entities::*;
-
-use concrete_fft::c64;
-
-pub fn extract_bits_scratch<Scalar>(
-    input_lwe_dimension: LweDimension,
-    ksk_after_key_size: LweDimension,
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    let align = CACHELINE_ALIGN;
-
-    let lwe_in_buffer =
-        StackReq::try_new_aligned::<Scalar>(input_lwe_dimension.to_lwe_size().0, align)?;
-    let lwe_out_ks_buffer =
-        StackReq::try_new_aligned::<Scalar>(ksk_after_key_size.to_lwe_size().0, align)?;
-    let pbs_accumulator =
-        StackReq::try_new_aligned::<Scalar>(glwe_size.0 * polynomial_size.0, align)?;
-    let lwe_out_pbs_buffer = StackReq::try_new_aligned::<Scalar>(
-        glwe_size
-            .to_glwe_dimension()
-            .to_equivalent_lwe_dimension(polynomial_size)
-            .to_lwe_size()
-            .0,
-        align,
-    )?;
-    let lwe_bit_left_shift_buffer = lwe_in_buffer;
-    let bootstrap_scratch = bootstrap_scratch::<Scalar>(glwe_size, polynomial_size, fft)?;
-
-    lwe_in_buffer
-        .try_and(lwe_out_ks_buffer)?
-        .try_and(pbs_accumulator)?
-        .try_and(lwe_out_pbs_buffer)?
-        .try_and(StackReq::try_any_of([
-            lwe_bit_left_shift_buffer,
-            bootstrap_scratch,
-        ])?)
-}
-
-/// Function to extract `number_of_bits_to_extract` from an [`LweCiphertext`] starting at the bit
-/// number `delta_log` (0-indexed) included.
-///
-/// Output bits are ordered from the MSB to the LSB. Each one of them is output in a distinct LWE
-/// ciphertext, containing the encryption of the bit scaled by q/2 (i.e., the most significant bit
-/// in the plaintext representation).
-pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
-    mut lwe_list_out: LweCiphertextList<&'_ mut [Scalar]>,
-    lwe_in: LweCiphertext<&'_ [Scalar]>,
-    ksk: LweKeyswitchKey<&'_ [Scalar]>,
-    fourier_bsk: FourierLweBootstrapKeyView<'_>,
-    delta_log: DeltaLog,
-    number_of_bits_to_extract: ExtractedBitsCount,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    debug_assert!(lwe_list_out.ciphertext_modulus() == lwe_in.ciphertext_modulus());
-    debug_assert!(lwe_in.ciphertext_modulus() == ksk.ciphertext_modulus());
-    debug_assert!(
-        ksk.ciphertext_modulus().is_native_modulus(),
-        "This operation only supports native moduli"
-    );
-
-    let ciphertext_n_bits = Scalar::BITS;
-    let number_of_bits_to_extract = number_of_bits_to_extract.0;
-
-    debug_assert!(
-        ciphertext_n_bits >= number_of_bits_to_extract + delta_log.0,
-        "Tried to extract {} bits, while the maximum number of extractable bits for {} bits
-        ciphertexts and a scaling factor of 2^{} is {}",
-        number_of_bits_to_extract,
-        ciphertext_n_bits,
-        delta_log.0,
-        ciphertext_n_bits - delta_log.0,
-    );
-    debug_assert!(
-        lwe_list_out.lwe_size().to_lwe_dimension() == ksk.output_key_lwe_dimension(),
-        "lwe_list_out needs to have an lwe_size of {}, got {}",
-        ksk.output_key_lwe_dimension().0,
-        lwe_list_out.lwe_size().to_lwe_dimension().0,
-    );
-    debug_assert!(
-        lwe_list_out.lwe_ciphertext_count().0 == number_of_bits_to_extract,
-        "lwe_list_out needs to have a ciphertext count of {}, got {}",
-        number_of_bits_to_extract,
-        lwe_list_out.lwe_ciphertext_count().0,
-    );
-    debug_assert!(
-        lwe_in.lwe_size() == fourier_bsk.output_lwe_dimension().to_lwe_size(),
-        "lwe_in needs to have an LWE dimension of {}, got {}",
-        fourier_bsk.output_lwe_dimension().to_lwe_size().0,
-        lwe_in.lwe_size().0,
-    );
-    debug_assert!(
-        ksk.output_key_lwe_dimension() == fourier_bsk.input_lwe_dimension(),
-        "ksk needs to have an output LWE dimension of {}, got {}",
-        fourier_bsk.input_lwe_dimension().0,
-        ksk.output_key_lwe_dimension().0,
-    );
-    debug_assert!(lwe_list_out.ciphertext_modulus() == lwe_in.ciphertext_modulus());
-    debug_assert!(lwe_in.ciphertext_modulus() == ksk.ciphertext_modulus());
-
-    let polynomial_size = fourier_bsk.polynomial_size();
-    let glwe_size = fourier_bsk.glwe_size();
-    let glwe_dimension = glwe_size.to_glwe_dimension();
-    let ciphertext_modulus = lwe_in.ciphertext_modulus();
-
-    let align = CACHELINE_ALIGN;
-
-    let (mut lwe_in_buffer_data, stack) =
-        stack.collect_aligned(align, lwe_in.as_ref().iter().copied());
-    let mut lwe_in_buffer =
-        LweCiphertext::from_container(&mut *lwe_in_buffer_data, lwe_in.ciphertext_modulus());
-
-    let (mut lwe_out_ks_buffer_data, stack) =
-        stack.make_aligned_with(ksk.output_lwe_size().0, align, |_| Scalar::ZERO);
-    let mut lwe_out_ks_buffer =
-        LweCiphertext::from_container(&mut *lwe_out_ks_buffer_data, ksk.ciphertext_modulus());
-
-    let (mut pbs_accumulator_data, stack) =
-        stack.make_aligned_with(glwe_size.0 * polynomial_size.0, align, |_| Scalar::ZERO);
-    let mut pbs_accumulator = GlweCiphertextMutView::from_container(
-        &mut *pbs_accumulator_data,
-        polynomial_size,
-        ciphertext_modulus,
-    );
-
-    let lwe_size = glwe_dimension
-        .to_equivalent_lwe_dimension(polynomial_size)
-        .to_lwe_size();
-    let (mut lwe_out_pbs_buffer_data, mut stack) =
-        stack.make_aligned_with(lwe_size.0, align, |_| Scalar::ZERO);
-    let mut lwe_out_pbs_buffer = LweCiphertext::from_container(
-        &mut *lwe_out_pbs_buffer_data,
-        lwe_list_out.ciphertext_modulus(),
-    );
-
-    // We iterate on the list in reverse as we want to store the extracted MSB at index 0
-    for (bit_idx, mut output_ct) in lwe_list_out.iter_mut().rev().enumerate() {
-        // Shift on padding bit
-        let (lwe_bit_left_shift_buffer_data, _) = stack.rb_mut().collect_aligned(
-            align,
-            lwe_in_buffer
-                .as_ref()
-                .iter()
-                .map(|s| *s << (ciphertext_n_bits - delta_log.0 - bit_idx - 1)),
-        );
-
-        // Key switch to input PBS key
-        keyswitch_lwe_ciphertext(
-            &ksk,
-            &LweCiphertext::from_container(
-                &*lwe_bit_left_shift_buffer_data,
-                lwe_in.ciphertext_modulus(),
-            ),
-            &mut lwe_out_ks_buffer,
-        );
-
-        drop(lwe_bit_left_shift_buffer_data);
-
-        // Store the keyswitch output unmodified to the output list (as we need to to do other
-        // computations on the output of the keyswitch)
-        output_ct
-            .as_mut()
-            .copy_from_slice(lwe_out_ks_buffer.as_ref());
-
-        // If this was the last extracted bit, break
-        // we subtract 1 because if the number_of_bits_to_extract is 1 we want to stop right away
-        if bit_idx == number_of_bits_to_extract - 1 {
-            break;
-        }
-
-        // Add q/4 to center the error while computing a negacyclic LUT
-        let out_ks_body = lwe_out_ks_buffer.get_mut_body().data;
-        *out_ks_body = (*out_ks_body).wrapping_add(Scalar::ONE << (ciphertext_n_bits - 2));
-
-        // Fill lut for the current bit (equivalent to trivial encryption as mask is 0s)
-        // The LUT is filled with -alpha in each coefficient where alpha = delta*2^{bit_idx-1}
-        for poly_coeff in &mut pbs_accumulator
-            .as_mut_view()
-            .get_mut_body()
-            .as_mut_polynomial()
-            .iter_mut()
-        {
-            *poly_coeff = Scalar::ZERO.wrapping_sub(Scalar::ONE << (delta_log.0 - 1 + bit_idx));
-        }
-
-        fourier_bsk.bootstrap(
-            lwe_out_pbs_buffer.as_mut_view(),
-            lwe_out_ks_buffer.as_view(),
-            pbs_accumulator.as_view(),
-            fft,
-            stack.rb_mut(),
-        );
-
-        // Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 if the
-        // extracted bit was 0 and 1 in the other case
-        let out_pbs_body = lwe_out_pbs_buffer.get_mut_body().data;
-
-        *out_pbs_body = (*out_pbs_body).wrapping_add(Scalar::ONE << (delta_log.0 + bit_idx - 1));
-
-        // Remove the extracted bit from the initial LWE to get a 0 at the extracted bit location.
-        izip!(lwe_in_buffer.as_mut(), lwe_out_pbs_buffer.as_ref())
-            .for_each(|(out, inp)| *out = (*out).wrapping_sub(*inp));
-    }
-}
-
-pub fn circuit_bootstrap_boolean_scratch<Scalar>(
-    lwe_in_size: LweSize,
-    bsk_output_lwe_size: LweSize,
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    StackReq::try_new_aligned::<Scalar>(bsk_output_lwe_size.0, CACHELINE_ALIGN)?.try_and(
-        homomorphic_shift_boolean_scratch::<Scalar>(lwe_in_size, glwe_size, polynomial_size, fft)?,
-    )
-}
-
-/// Circuit bootstrapping for boolean messages, i.e. containing only one bit of message
-///
-/// The output GGSW ciphertext `ggsw_out` decomposition base log and level count are used as the
-/// circuit_bootstrap_boolean decomposition base log and level count.
-pub fn circuit_bootstrap_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
-    fourier_bsk: FourierLweBootstrapKeyView<'_>,
-    lwe_in: LweCiphertext<&[Scalar]>,
-    mut ggsw_out: GgswCiphertext<&mut [Scalar]>,
-    delta_log: DeltaLog,
-    pfpksk_list: LwePrivateFunctionalPackingKeyswitchKeyList<&[Scalar]>,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    debug_assert!(lwe_in.ciphertext_modulus() == ggsw_out.ciphertext_modulus());
-    debug_assert!(ggsw_out.ciphertext_modulus() == pfpksk_list.ciphertext_modulus());
-
-    debug_assert!(
-        pfpksk_list.ciphertext_modulus().is_native_modulus(),
-        "This operation currently only supports native moduli"
-    );
-
-    let level_cbs = ggsw_out.decomposition_level_count();
-    let base_log_cbs = ggsw_out.decomposition_base_log();
-
-    debug_assert!(
-        level_cbs.0 >= 1,
-        "level_cbs needs to be >= 1, got {}",
-        level_cbs.0
-    );
-    debug_assert!(
-        base_log_cbs.0 >= 1,
-        "base_log_cbs needs to be >= 1, got {}",
-        base_log_cbs.0
-    );
-
-    let fpksk_input_lwe_key_dimension = pfpksk_list.input_key_lwe_dimension();
-    let fourier_bsk_output_lwe_dimension = fourier_bsk.output_lwe_dimension();
-
-    debug_assert!(
-        fpksk_input_lwe_key_dimension == fourier_bsk_output_lwe_dimension,
-        "The fourier_bsk output_lwe_dimension, got {}, must be equal to the fpksk \
-        input_lwe_key_dimension, got {}",
-        fourier_bsk_output_lwe_dimension.0,
-        fpksk_input_lwe_key_dimension.0
-    );
-
-    let fpksk_output_polynomial_size = pfpksk_list.output_polynomial_size();
-    let fpksk_output_glwe_key_dimension = pfpksk_list.output_key_glwe_dimension();
-
-    debug_assert!(
-        ggsw_out.polynomial_size() == fpksk_output_polynomial_size,
-        "The output GGSW ciphertext needs to have the same polynomial size as the fpksks, \
-        got {}, expected {}",
-        ggsw_out.polynomial_size().0,
-        fpksk_output_polynomial_size.0
-    );
-
-    debug_assert!(
-        ggsw_out.glwe_size().to_glwe_dimension() == fpksk_output_glwe_key_dimension,
-        "The output GGSW ciphertext needs to have the same GLWE dimension as the fpksks, \
-        got {}, expected {}",
-        ggsw_out.glwe_size().to_glwe_dimension().0,
-        fpksk_output_glwe_key_dimension.0
-    );
-
-    debug_assert!(
-        ggsw_out.glwe_size().0 == pfpksk_list.lwe_pfpksk_count().0,
-        "The input vector of pfpksk_list needs to have {} ggsw.glwe_size elements got {}",
-        ggsw_out.glwe_size().0,
-        pfpksk_list.lwe_pfpksk_count().0,
-    );
-
-    // Output for every bootstrapping
-    let (mut lwe_out_bs_buffer_data, mut stack) = stack.make_aligned_with(
-        fourier_bsk_output_lwe_dimension.to_lwe_size().0,
-        CACHELINE_ALIGN,
-        |_| Scalar::ZERO,
-    );
-    let mut lwe_out_bs_buffer =
-        LweCiphertext::from_container(&mut *lwe_out_bs_buffer_data, lwe_in.ciphertext_modulus());
-
-    for (decomposition_level_minus_one, mut ggsw_level_matrix) in ggsw_out.iter_mut().enumerate() {
-        let decomposition_level = DecompositionLevel(decomposition_level_minus_one + 1);
-        homomorphic_shift_boolean(
-            fourier_bsk,
-            lwe_out_bs_buffer.as_mut_view(),
-            lwe_in.as_view(),
-            decomposition_level,
-            base_log_cbs,
-            delta_log,
-            fft,
-            stack.rb_mut(),
-        );
-
-        for (pfpksk, mut glwe_out) in pfpksk_list
-            .iter()
-            .zip(ggsw_level_matrix.as_mut_glwe_list().iter_mut())
-        {
-            private_functional_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
-                &pfpksk,
-                &mut glwe_out,
-                &lwe_out_bs_buffer,
-            );
-        }
-    }
-}
-
-pub fn homomorphic_shift_boolean_scratch<Scalar>(
-    lwe_in_size: LweSize,
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    let align = CACHELINE_ALIGN;
-    StackReq::try_new_aligned::<Scalar>(lwe_in_size.0, align)?
-        .try_and(StackReq::try_new_aligned::<Scalar>(
-            polynomial_size.0 * glwe_size.0,
-            align,
-        )?)?
-        .try_and(bootstrap_scratch::<Scalar>(
-            glwe_size,
-            polynomial_size,
-            fft,
-        )?)
-}
-
-/// Homomorphic shift for LWE without padding bit
-///
-/// Starts by shifting the message bit at bit #delta_log to the padding bit and then shifts it to
-/// the right by base_log * level.
-pub fn homomorphic_shift_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
-    fourier_bsk: FourierLweBootstrapKeyView<'_>,
-    mut lwe_out: LweCiphertext<&mut [Scalar]>,
-    lwe_in: LweCiphertext<&[Scalar]>,
-    level_count_cbs: DecompositionLevel,
-    base_log_cbs: DecompositionBaseLog,
-    delta_log: DeltaLog,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    debug_assert!(lwe_out.ciphertext_modulus() == lwe_in.ciphertext_modulus());
-    debug_assert!(
-        lwe_in.ciphertext_modulus().is_native_modulus(),
-        "This operation currently only supports native moduli"
-    );
-
-    let ciphertext_n_bits = Scalar::BITS;
-    let lwe_in_size = lwe_in.lwe_size();
-    let polynomial_size = fourier_bsk.polynomial_size();
-    let ciphertext_moudulus = lwe_out.ciphertext_modulus();
-
-    let (mut lwe_left_shift_buffer_data, stack) =
-        stack.make_aligned_with(lwe_in_size.0, CACHELINE_ALIGN, |_| Scalar::ZERO);
-    let mut lwe_left_shift_buffer = LweCiphertext::from_container(
-        &mut *lwe_left_shift_buffer_data,
-        lwe_in.ciphertext_modulus(),
-    );
-    // Shift message LSB on padding bit, at this point we expect to have messages with only 1 bit
-    // of information
-    lwe_ciphertext_cleartext_mul(
-        &mut lwe_left_shift_buffer,
-        &lwe_in,
-        Cleartext(Scalar::ONE << (ciphertext_n_bits - delta_log.0 - 1)),
-    );
-
-    // Add q/4 to center the error while computing a negacyclic LUT
-    let shift_buffer_body = lwe_left_shift_buffer.get_mut_body();
-    *shift_buffer_body.data =
-        (*shift_buffer_body.data).wrapping_add(Scalar::ONE << (ciphertext_n_bits - 2));
-
-    let (mut pbs_accumulator_data, stack) = stack.make_aligned_with(
-        polynomial_size.0 * fourier_bsk.glwe_size().0,
-        CACHELINE_ALIGN,
-        |_| Scalar::ZERO,
-    );
-    let mut pbs_accumulator = GlweCiphertextMutView::from_container(
-        &mut *pbs_accumulator_data,
-        polynomial_size,
-        ciphertext_moudulus,
-    );
-
-    // Fill lut (equivalent to trivial encryption as mask is 0s)
-    // The LUT is filled with -alpha in each coefficient where
-    // alpha = 2^{log(q) - 1 - base_log * level}
-    pbs_accumulator.get_mut_body().as_mut().fill(
-        Scalar::ZERO.wrapping_sub(
-            Scalar::ONE << (ciphertext_n_bits - 1 - base_log_cbs.0 * level_count_cbs.0),
-        ),
-    );
-
-    // Applying a negacyclic LUT on a ciphertext with one bit of message in the MSB and no bit
-    // of padding
-    fourier_bsk.bootstrap(
-        lwe_out.as_mut_view(),
-        lwe_left_shift_buffer.as_view(),
-        pbs_accumulator.as_view(),
-        fft,
-        stack,
-    );
-
-    // Add alpha where alpha = 2^{log(q) - 1 - base_log * level}
-    // To end up with an encryption of 0 if the message bit was 0 and 1 in the other case
-    let out_body = lwe_out.get_mut_body();
-    *out_body.data = (*out_body.data)
-        .wrapping_add(Scalar::ONE << (ciphertext_n_bits - 1 - base_log_cbs.0 * level_count_cbs.0));
-}
-
-pub fn cmux_tree_memory_optimized_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    nb_layer: usize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    let t_scratch = StackReq::try_new_aligned::<Scalar>(
-        polynomial_size.0 * glwe_size.0 * nb_layer,
-        CACHELINE_ALIGN,
-    )?;
-
-    StackReq::try_all_of([
-        t_scratch,                             // t_0
-        t_scratch,                             // t_1
-        StackReq::try_new::<usize>(nb_layer)?, // t_fill
-        t_scratch,                             // diff
-        add_external_product_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)?,
-    ])
-}
-
-/// Perform a tree of cmux in a way that limits the total allocated memory to avoid issues for
-/// bigger trees.
-pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
-    mut output_glwe: GlweCiphertext<&mut [Scalar]>,
-    lut_per_layer: PolynomialList<&[Scalar]>,
-    ggsw_list: FourierGgswCiphertextListView<'_>,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    debug_assert!(lut_per_layer.polynomial_count().0 == 1 << ggsw_list.count());
-
-    if ggsw_list.count() > 0 {
-        let glwe_size = output_glwe.glwe_size();
-        let ciphertext_modulus = output_glwe.ciphertext_modulus();
-        let polynomial_size = ggsw_list.polynomial_size();
-        let nb_layer = ggsw_list.count();
-
-        debug_assert!(stack.can_hold(
-            cmux_tree_memory_optimized_scratch::<Scalar>(glwe_size, polynomial_size, nb_layer, fft)
-                .unwrap()
-        ));
-
-        // These are accumulator that will be used to propagate the result from layer to layer
-        // At index 0 you have the lut that will be loaded, and then the result for each layer gets
-        // computed at the next index, last layer result gets stored in `result`.
-        // This allow to use memory space in C * nb_layer instead of C' * 2 ^ nb_layer
-        let (mut t_0_data, stack) = stack.make_aligned_with(
-            polynomial_size.0 * glwe_size.0 * nb_layer,
-            CACHELINE_ALIGN,
-            |_| Scalar::ZERO,
-        );
-        let (mut t_1_data, stack) = stack.make_aligned_with(
-            polynomial_size.0 * glwe_size.0 * nb_layer,
-            CACHELINE_ALIGN,
-            |_| Scalar::ZERO,
-        );
-
-        let mut t_0 = GlweCiphertextList::from_container(
-            t_0_data.as_mut(),
-            glwe_size,
-            polynomial_size,
-            ciphertext_modulus,
-        );
-        let mut t_1 = GlweCiphertextList::from_container(
-            t_1_data.as_mut(),
-            glwe_size,
-            polynomial_size,
-            ciphertext_modulus,
-        );
-
-        let (mut t_fill, mut stack) = stack.make_with(nb_layer, |_| 0_usize);
-
-        let mut lut_polynomial_iter = lut_per_layer.iter();
-        loop {
-            let even = lut_polynomial_iter.next();
-            let odd = lut_polynomial_iter.next();
-
-            let (lut_2i, lut_2i_plus_1) = match (even, odd) {
-                (Some(even), Some(odd)) => (even, odd),
-                _ => break,
-            };
-
-            let mut t_iter = izip!(t_0.iter_mut(), t_1.iter_mut(),).enumerate();
-
-            let (mut j_counter, (mut t0_j, mut t1_j)) = t_iter.next().unwrap();
-
-            t0_j.get_mut_body()
-                .as_mut()
-                .copy_from_slice(lut_2i.as_ref());
-
-            t1_j.get_mut_body()
-                .as_mut()
-                .copy_from_slice(lut_2i_plus_1.as_ref());
-
-            t_fill[0] = 2;
-
-            for (j, ggsw) in ggsw_list.into_ggsw_iter().rev().enumerate() {
-                if t_fill[j] == 2 {
-                    let (diff_data, stack) = stack.rb_mut().collect_aligned(
-                        CACHELINE_ALIGN,
-                        izip!(t1_j.as_ref(), t0_j.as_ref()).map(|(&a, &b)| a.wrapping_sub(b)),
-                    );
-                    let diff = GlweCiphertext::from_container(
-                        &*diff_data,
-                        polynomial_size,
-                        ciphertext_modulus,
-                    );
-
-                    if j != nb_layer - 1 {
-                        let (j_counter_plus_1, (mut t_0_j_plus_1, mut t_1_j_plus_1)) =
-                            t_iter.next().unwrap();
-
-                        assert_eq!(j_counter, j);
-                        assert_eq!(j_counter_plus_1, j + 1);
-
-                        let mut output = if t_fill[j + 1] == 0 {
-                            t_0_j_plus_1.as_mut_view()
-                        } else {
-                            t_1_j_plus_1.as_mut_view()
-                        };
-
-                        output.as_mut().copy_from_slice(t0_j.as_ref());
-                        add_external_product_assign(output, ggsw, diff, fft, stack);
-                        t_fill[j + 1] += 1;
-                        t_fill[j] = 0;
-
-                        drop(diff_data);
-
-                        (j_counter, t0_j, t1_j) = (j_counter_plus_1, t_0_j_plus_1, t_1_j_plus_1);
-                    } else {
-                        let mut output = output_glwe.as_mut_view();
-                        output.as_mut().copy_from_slice(t0_j.as_ref());
-                        add_external_product_assign(output, ggsw, diff, fft, stack);
-                    }
-                } else {
-                    break;
-                }
-            }
-        }
-    } else {
-        output_glwe.get_mut_mask().as_mut().fill(Scalar::ZERO);
-        output_glwe
-            .get_mut_body()
-            .as_mut()
-            .copy_from_slice(lut_per_layer.as_ref());
-    }
-}
-
-pub fn circuit_bootstrap_boolean_vertical_packing_scratch<Scalar>(
-    lwe_list_in_count: LweCiphertextCount,
-    lwe_list_out_count: LweCiphertextCount,
-    lwe_in_size: LweSize,
-    big_lut_polynomial_count: PolynomialCount,
-    bsk_output_lwe_size: LweSize,
-    glwe_size: GlweSize,
-    fpksk_output_polynomial_size: PolynomialSize,
-    level_cbs: DecompositionLevelCount,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    // We deduce the number of luts in the vec_lut from the number of cipherxtexts in lwe_list_out
-    let number_of_luts = lwe_list_out_count.0;
-    let small_lut_size = PolynomialCount(big_lut_polynomial_count.0 / number_of_luts);
-
-    StackReq::try_all_of([
-        StackReq::try_new_aligned::<c64>(
-            lwe_list_in_count.0 * fpksk_output_polynomial_size.0 / 2
-                * glwe_size.0
-                * glwe_size.0
-                * level_cbs.0,
-            CACHELINE_ALIGN,
-        )?,
-        StackReq::try_new_aligned::<Scalar>(
-            fpksk_output_polynomial_size.0 * glwe_size.0 * glwe_size.0 * level_cbs.0,
-            CACHELINE_ALIGN,
-        )?,
-        StackReq::try_any_of([
-            circuit_bootstrap_boolean_scratch::<Scalar>(
-                lwe_in_size,
-                bsk_output_lwe_size,
-                glwe_size,
-                fpksk_output_polynomial_size,
-                fft,
-            )?,
-            fill_with_forward_fourier_scratch(fft)?,
-            vertical_packing_scratch::<Scalar>(
-                glwe_size,
-                fpksk_output_polynomial_size,
-                small_lut_size,
-                lwe_list_in_count.0,
-                fft,
-            )?,
-        ])?,
-    ])
-}
-
-/// Perform a circuit bootstrap followed by a vertical packing on ciphertexts encrypting boolean
-/// messages.
-///
-/// The circuit bootstrapping uses the private functional packing key switch.
-///
-/// This is supposed to be used only with boolean (1 bit of message) LWE ciphertexts.
-pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastInto<usize>>(
-    big_lut_as_polynomial_list: PolynomialList<&[Scalar]>,
-    fourier_bsk: FourierLweBootstrapKeyView<'_>,
-    mut lwe_list_out: LweCiphertextList<&mut [Scalar]>,
-    lwe_list_in: LweCiphertextList<&[Scalar]>,
-    pfpksk_list: LwePrivateFunctionalPackingKeyswitchKeyList<&[Scalar]>,
-    level_cbs: DecompositionLevelCount,
-    base_log_cbs: DecompositionBaseLog,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    debug_assert!(stack.can_hold(
-        circuit_bootstrap_boolean_vertical_packing_scratch::<Scalar>(
-            lwe_list_in.lwe_ciphertext_count(),
-            lwe_list_out.lwe_ciphertext_count(),
-            lwe_list_in.lwe_size(),
-            big_lut_as_polynomial_list.polynomial_count(),
-            fourier_bsk.output_lwe_dimension().to_lwe_size(),
-            fourier_bsk.glwe_size(),
-            pfpksk_list.output_polynomial_size(),
-            level_cbs,
-            fft
-        )
-        .unwrap()
-    ));
-    debug_assert!(
-        lwe_list_in.lwe_ciphertext_count().0 != 0,
-        "Got empty `lwe_list_in`"
-    );
-    debug_assert!(
-        lwe_list_out.lwe_size().to_lwe_dimension() == fourier_bsk.output_lwe_dimension(),
-        "Output LWE ciphertext needs to have an LweDimension of {}, got {}",
-        lwe_list_out.lwe_size().to_lwe_dimension().0,
-        fourier_bsk.output_lwe_dimension().0
-    );
-    debug_assert!(lwe_list_out.ciphertext_modulus() == lwe_list_in.ciphertext_modulus());
-    debug_assert!(lwe_list_in.ciphertext_modulus() == pfpksk_list.ciphertext_modulus());
-    debug_assert!(
-        pfpksk_list.ciphertext_modulus().is_native_modulus(),
-        "This operation currently only supports native moduli"
-    );
-
-    let glwe_size = pfpksk_list.output_key_glwe_dimension().to_glwe_size();
-    let (mut ggsw_list_data, stack) = stack.make_aligned_with(
-        lwe_list_in.lwe_ciphertext_count().0 * pfpksk_list.output_polynomial_size().0 / 2
-            * glwe_size.0
-            * glwe_size.0
-            * level_cbs.0,
-        CACHELINE_ALIGN,
-        |_| c64::default(),
-    );
-    let (mut ggsw_res_data, mut stack) = stack.make_aligned_with(
-        pfpksk_list.output_polynomial_size().0 * glwe_size.0 * glwe_size.0 * level_cbs.0,
-        CACHELINE_ALIGN,
-        |_| Scalar::ZERO,
-    );
-
-    let mut ggsw_list = FourierGgswCiphertextListMutView::new(
-        &mut ggsw_list_data,
-        lwe_list_in.lwe_ciphertext_count().0,
-        glwe_size,
-        pfpksk_list.output_polynomial_size(),
-        base_log_cbs,
-        level_cbs,
-    );
-
-    let mut ggsw_res = GgswCiphertext::from_container(
-        &mut *ggsw_res_data,
-        glwe_size,
-        pfpksk_list.output_polynomial_size(),
-        base_log_cbs,
-        pfpksk_list.ciphertext_modulus(),
-    );
-
-    for (lwe_in, ggsw) in izip!(lwe_list_in.iter(), ggsw_list.as_mut_view().into_ggsw_iter(),) {
-        circuit_bootstrap_boolean(
-            fourier_bsk,
-            lwe_in,
-            ggsw_res.as_mut_view(),
-            DeltaLog(Scalar::BITS - 1),
-            pfpksk_list.as_view(),
-            fft,
-            stack.rb_mut(),
-        );
-
-        ggsw.fill_with_forward_fourier(ggsw_res.as_view(), fft, stack.rb_mut());
-    }
-
-    // We deduce the number of luts in the vec_lut from the number of cipherxtexts in lwe_list_out
-    let number_of_luts = lwe_list_out.lwe_ciphertext_count().0;
-
-    let small_lut_size = big_lut_as_polynomial_list.polynomial_count().0 / number_of_luts;
-
-    for (lut, lwe_out) in izip!(
-        big_lut_as_polynomial_list.chunks_exact(small_lut_size),
-        lwe_list_out.iter_mut(),
-    ) {
-        vertical_packing(lut, lwe_out, ggsw_list.as_view(), fft, stack.rb_mut());
-    }
-}
-
-pub fn vertical_packing_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    lut_polynomial_count: PolynomialCount,
-    ggsw_list_count: usize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    let bits = core::mem::size_of::<Scalar>() * 8;
-
-    // Get the base 2 logarithm (rounded down) of the number of polynomials in the list i.e. if
-    // there is one polynomial, the number will be 0
-    let log_lut_number: usize = bits - 1 - lut_polynomial_count.0.leading_zeros() as usize;
-
-    let log_number_of_luts_for_cmux_tree = if log_lut_number > ggsw_list_count {
-        // this means that we dont have enough GGSW to perform the CMux tree, we can only do the
-        // Blind rotation
-        0
-    } else {
-        log_lut_number
-    };
-
-    StackReq::try_all_of([
-        // cmux_tree_lut_res
-        StackReq::try_new_aligned::<Scalar>(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN)?,
-        StackReq::try_any_of([
-            blind_rotate_assign_scratch::<Scalar>(glwe_size, polynomial_size, fft)?,
-            cmux_tree_memory_optimized_scratch::<Scalar>(
-                glwe_size,
-                polynomial_size,
-                log_number_of_luts_for_cmux_tree,
-                fft,
-            )?,
-        ])?,
-    ])
-}
-
-// GGSW ciphertexts are stored from the msb (vec_ggsw[0]) to the lsb (vec_ggsw[last])
-pub fn vertical_packing<Scalar: UnsignedTorus + CastInto<usize>>(
-    lut: PolynomialList<&[Scalar]>,
-    mut lwe_out: LweCiphertext<&mut [Scalar]>,
-    ggsw_list: FourierGgswCiphertextListView<'_>,
-    fft: FftView<'_>,
-    stack: PodStack<'_>,
-) {
-    debug_assert!(
-        lwe_out.ciphertext_modulus().is_native_modulus(),
-        "This operation currently only supports native moduli"
-    );
-
-    let polynomial_size = ggsw_list.polynomial_size();
-    let glwe_size = ggsw_list.glwe_size();
-    let glwe_dimension = glwe_size.to_glwe_dimension();
-    let ciphertext_modulus = lwe_out.ciphertext_modulus();
-
-    debug_assert!(
-        lwe_out.lwe_size().to_lwe_dimension()
-            == glwe_dimension.to_equivalent_lwe_dimension(polynomial_size),
-        "Output LWE ciphertext needs to have an LweDimension of {:?}, got {:?}",
-        glwe_dimension.to_equivalent_lwe_dimension(polynomial_size),
-        lwe_out.lwe_size().to_lwe_dimension(),
-    );
-
-    // Get the base 2 logarithm (rounded down) of the number of polynomials in the list i.e. if
-    // there is one polynomial, the number will be 0
-    let log_lut_number: usize =
-        Scalar::BITS - 1 - lut.polynomial_count().0.leading_zeros() as usize;
-
-    let log_number_of_luts_for_cmux_tree = if log_lut_number > ggsw_list.count() {
-        // this means that we dont have enough GGSW to perform the CMux tree, we can only do the
-        // Blind rotation
-        0
-    } else {
-        log_lut_number
-    };
-
-    // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the lsb GGSW is for
-    // the last blind rotation.
-    let (cmux_ggsw, br_ggsw) = ggsw_list.split_at(log_number_of_luts_for_cmux_tree);
-
-    let (mut cmux_tree_lut_res_data, mut stack) =
-        stack.make_aligned_with(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN, |_| {
-            Scalar::ZERO
-        });
-    let mut cmux_tree_lut_res = GlweCiphertext::from_container(
-        &mut *cmux_tree_lut_res_data,
-        polynomial_size,
-        ciphertext_modulus,
-    );
-
-    cmux_tree_memory_optimized(
-        cmux_tree_lut_res.as_mut_view(),
-        lut,
-        cmux_ggsw,
-        fft,
-        stack.rb_mut(),
-    );
-    blind_rotate_assign(
-        cmux_tree_lut_res.as_mut_view(),
-        br_ggsw,
-        fft,
-        stack.rb_mut(),
-    );
-
-    // sample extract of the RLWE of the Vertical packing
-    extract_lwe_sample_from_glwe_ciphertext(&cmux_tree_lut_res, &mut lwe_out, MonomialDegree(0))
-}
-
-pub fn blind_rotate_assign_scratch<Scalar>(
-    glwe_size: GlweSize,
-    polynomial_size: PolynomialSize,
-    fft: FftView<'_>,
-) -> Result<StackReq, SizeOverflow> {
-    StackReq::try_all_of([
-        StackReq::try_new_aligned::<Scalar>(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN)?,
-        cmux_scratch::<Scalar>(glwe_size, polynomial_size, fft)?,
-    ])
-}
-
-pub fn blind_rotate_assign<Scalar: UnsignedTorus + CastInto<usize>>(
-    mut lut: GlweCiphertext<&mut [Scalar]>,
-    ggsw_list: FourierGgswCiphertextListView<'_>,
-    fft: FftView<'_>,
-    mut stack: PodStack<'_>,
-) {
-    let mut monomial_degree = MonomialDegree(1);
-
-    for ggsw in ggsw_list.into_ggsw_iter().rev() {
-        let ct_0 = lut.as_mut_view();
-        let (mut ct1_data, stack) = stack
-            .rb_mut()
-            .collect_aligned(CACHELINE_ALIGN, ct_0.as_ref().iter().copied());
-        let mut ct_1 = GlweCiphertext::from_container(
-            &mut *ct1_data,
-            ct_0.polynomial_size(),
-            ct_0.ciphertext_modulus(),
-        );
-        ct_1.as_mut_polynomial_list()
-            .iter_mut()
-            .for_each(|mut poly| {
-                polynomial_wrapping_monic_monomial_div_assign(&mut poly, monomial_degree)
-            });
-        monomial_degree.0 <<= 1;
-        cmux(ct_0, ct_1, ggsw, fft, stack);
-    }
-}
-
-#[cfg(test)]
-mod tests;
--- a/tfhe/src/core_crypto/fft_impl/fft64/math/decomposition.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/math/decomposition.rs
@@ -1,86 +0,0 @@
-pub use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
-use crate::core_crypto::commons::numeric::UnsignedInteger;
-use crate::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount};
-use dyn_stack::{DynArray, PodStack};
-use std::iter::Map;
-use std::slice::IterMut;
-
-// copied from src/commons/math/decomposition/*.rs
-// in order to avoid allocations
-
-pub struct TensorSignedDecompositionLendingIter<'buffers, Scalar: UnsignedInteger> {
-    // The base log of the decomposition
-    base_log: usize,
-    // The current level
-    current_level: usize,
-    // A mask which allows to compute the mod B of a value. For B=2^4, this guy is of the form:
-    // ...0001111
-    mod_b_mask: Scalar,
-    // The internal states of each decomposition
-    states: DynArray<'buffers, Scalar>,
-    // A flag which stores whether the iterator is a fresh one (for the recompose method).
-    fresh: bool,
-}
-
-impl<'buffers, Scalar: UnsignedInteger> TensorSignedDecompositionLendingIter<'buffers, Scalar> {
-    #[inline]
-    pub(crate) fn new(
-        input: impl Iterator<Item = Scalar>,
-        base_log: DecompositionBaseLog,
-        level: DecompositionLevelCount,
-        stack: PodStack<'buffers>,
-    ) -> (Self, PodStack<'buffers>) {
-        let shift = Scalar::BITS - base_log.0 * level.0;
-        let (states, stack) =
-            stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, input.map(|i| i >> shift));
-        (
-            TensorSignedDecompositionLendingIter {
-                base_log: base_log.0,
-                current_level: level.0,
-                mod_b_mask: (Scalar::ONE << base_log.0) - Scalar::ONE,
-                states,
-                fresh: true,
-            },
-            stack,
-        )
-    }
-
-    // inlining this improves perf of external product by about 25%, even in LTO builds
-    #[inline]
-    pub fn next_term<'short>(
-        &'short mut self,
-    ) -> Option<(
-        DecompositionLevel,
-        DecompositionBaseLog,
-        Map<IterMut<'short, Scalar>, impl FnMut(&'short mut Scalar) -> Scalar>,
-    )> {
-        // The iterator is not fresh anymore.
-        self.fresh = false;
-        // We check if the decomposition is over
-        if self.current_level == 0 {
-            return None;
-        }
-        let current_level = self.current_level;
-        let base_log = self.base_log;
-        let mod_b_mask = self.mod_b_mask;
-        self.current_level -= 1;
-
-        Some((
-            DecompositionLevel(current_level),
-            DecompositionBaseLog(self.base_log),
-            self.states
-                .iter_mut()
-                .map(move |state| decompose_one_level(base_log, state, mod_b_mask)),
-        ))
-    }
-}
-
-#[inline]
-fn decompose_one_level<S: UnsignedInteger>(base_log: usize, state: &mut S, mod_b_mask: S) -> S {
-    let res = *state & mod_b_mask;
-    *state >>= base_log;
-    let mut carry = (res.wrapping_sub(S::ONE) | *state) & res;
-    carry >>= base_log - 1;
-    *state += carry;
-    res.wrapping_sub(carry << base_log)
-}
--- a/tfhe/src/core_crypto/fft_impl/fft64/math/mod.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/math/mod.rs
@@ -1,3 +0,0 @@
-pub mod decomposition;
-pub mod fft;
-pub mod polynomial;
--- a/tfhe/src/core_crypto/fft_impl/fft64/math/polynomial.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/math/polynomial.rs
@@ -1,57 +0,0 @@
-use crate::core_crypto::commons::parameters::*;
-use crate::core_crypto::commons::traits::*;
-use aligned_vec::{avec, ABox};
-use concrete_fft::c64;
-
-//--------------------------------------------------------------------------------
-// Structure definitions
-//--------------------------------------------------------------------------------
-
-/// Polynomial in the Fourier domain.
-///
-/// # Note
-///
-/// Polynomials in the Fourier domain have half the size of the corresponding polynomials in
-/// the standard domain.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub struct FourierPolynomial<C: Container> {
-    pub data: C,
-}
-
-pub type FourierPolynomialView<'a> = FourierPolynomial<&'a [c64]>;
-pub type FourierPolynomialMutView<'a> = FourierPolynomial<&'a mut [c64]>;
-
-pub type FourierPolynomialOwned = FourierPolynomial<ABox<[c64]>>;
-
-impl FourierPolynomial<ABox<[c64]>> {
-    pub fn new(polynomial_size: PolynomialSize) -> FourierPolynomial<ABox<[c64]>> {
-        let boxed = avec![
-            c64::default();
-            polynomial_size.to_fourier_polynomial_size().0
-        ]
-        .into_boxed_slice();
-
-        FourierPolynomial { data: boxed }
-    }
-}
-
-impl<C: Container<Element = c64>> FourierPolynomial<C> {
-    pub fn as_view(&self) -> FourierPolynomialView<'_> {
-        FourierPolynomial {
-            data: self.data.as_ref(),
-        }
-    }
-
-    pub fn as_mut_view(&mut self) -> FourierPolynomialMutView<'_>
-    where
-        C: AsMut<[c64]>,
-    {
-        FourierPolynomial {
-            data: self.data.as_mut(),
-        }
-    }
-
-    pub fn polynomial_size(&self) -> PolynomialSize {
-        PolynomialSize(self.data.container_len() * 2)
-    }
-}
--- a/tfhe/src/core_crypto/fft_impl/fft64/mod.rs
+++ b/tfhe/src/core_crypto/fft_impl/fft64/mod.rs
@@ -1,5 +0,0 @@
-#![doc(hidden)]
-pub use concrete_fft::c64;
-
-pub mod crypto;
-pub mod math;
--- a/tfhe/src/core_crypto/fft_impl/mod.rs
+++ b/tfhe/src/core_crypto/fft_impl/mod.rs
@@ -1,9 +1,4 @@
 pub mod common;

-// TODO REFACTOR
-// For now this module is not refactored, it contains high performance code and will be refactored
-// at a later stage. It is self contained, allowing to put it in its own module in the meantime.
-pub mod fft64;
-
 pub mod fft128;
 mod fft128_u128;
--- a/tfhe/src/core_crypto/prelude.rs
+++ b/tfhe/src/core_crypto/prelude.rs
@@ -10,10 +10,10 @@ pub use super::commons::computation_buffers::ComputationBuffers;
 pub use super::commons::dispersion::*;
 pub use super::commons::generators::{EncryptionRandomGenerator, SecretRandomGenerator};
 pub use super::commons::math::decomposition::SignedDecomposer;
+pub use super::commons::math::fft64::Fft;
 pub use super::commons::math::random::ActivatedRandomGenerator;
 pub use super::commons::parameters::*;
 pub use super::commons::traits::*;
 pub use super::entities::*;
 pub use super::fft_impl::fft128::math::fft::Fft128;
-pub use super::fft_impl::fft64::math::fft::Fft;
 pub use super::seeders::*;
--- a/tfhe/src/shortint/engine/server_side/mod.rs
+++ b/tfhe/src/shortint/engine/server_side/mod.rs
@@ -1,13 +1,12 @@
 use super::ShortintEngine;
 use crate::core_crypto::algorithms::*;
 use crate::core_crypto::commons::ciphertext_modulus::CiphertextModulus;
+use crate::core_crypto::commons::math::fft64::Fft;
 use crate::core_crypto::commons::parameters::{
    DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweBskGroupingFactor,
    LweDimension, PolynomialSize, ThreadCount,
 };
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::FourierLweBootstrapKey;
-use crate::core_crypto::fft_impl::fft64::math::fft::Fft;
 use crate::shortint::ciphertext::Degree;
 use crate::shortint::engine::EngineResult;
 use crate::shortint::parameters::{MessageModulus, ShortintKeySwitchingParameters};
--- a/tfhe/src/shortint/engine/wopbs/mod.rs
+++ b/tfhe/src/shortint/engine/wopbs/mod.rs
@@ -1,10 +1,9 @@
 //! # WARNING: this module is experimental.
 use crate::core_crypto::algorithms::*;
+use crate::core_crypto::commons::math::fft64::Fft;
 use crate::core_crypto::commons::parameters::*;
 use crate::core_crypto::commons::traits::*;
 use crate::core_crypto::entities::*;
-use crate::core_crypto::fft_impl::fft64::crypto::bootstrap::FourierLweBootstrapKey;
-use crate::core_crypto::fft_impl::fft64::math::fft::Fft;
 use crate::shortint::ciphertext::Degree;
 use crate::shortint::engine::{EngineResult, ShortintEngine};
 use crate::shortint::server_key::{MaxDegree, ShortintBootstrappingKey};