feat(optimizer): add gpu parameter constraints

2026-02-08 19:44:57 -05:00 · 2022-10-18 14:37:55 +02:00
parent 1f15162b39
commit c5dad8ffdb
20 changed files with 286 additions and 175 deletions
--- a/charts/src/bin/norm2_complexity.rs
+++ b/charts/src/bin/norm2_complexity.rs
@@ -1,5 +1,6 @@
 use charts::{draw, Serie};
 use concrete_optimizer::computing_cost::cpu::CpuComplexity;
+use concrete_optimizer::config;
 use concrete_optimizer::global_parameters::DEFAUT_DOMAINS;
 use concrete_optimizer::optimization::atomic_pattern::{self as optimize_atomic_pattern};
 use concrete_optimizer::optimization::config::{Config, SearchSpace};
@@ -25,6 +26,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    let glwe_dimensions: Vec<_> = (1..=6).collect();
    let internal_lwe_dimensions: Vec<_> = (MIN_LWE_DIM..=MAX_LWE_DIM).step_by(10).collect();

+    let processing_unit = config::ProcessingUnit::Cpu;
+
    let search_space = SearchSpace {
        glwe_log_polynomial_sizes,
        glwe_dimensions,
@@ -41,7 +44,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        complexity_model: &CpuComplexity::default(),
    };

-    let cache = decomposition::cache(security_level);
+    let cache = decomposition::cache(security_level, processing_unit, None);

    let solutions: Vec<_> = log_norm2s
        .clone()
--- a/charts/src/bin/precision_complexity.rs
+++ b/charts/src/bin/precision_complexity.rs
@@ -1,5 +1,6 @@
 use charts::{draw, Serie};
 use concrete_optimizer::computing_cost::cpu::CpuComplexity;
+use concrete_optimizer::config;
 use concrete_optimizer::global_parameters::DEFAUT_DOMAINS;
 use concrete_optimizer::optimization::atomic_pattern::{self as optimize_atomic_pattern};
 use concrete_optimizer::optimization::config::{Config, SearchSpace};
@@ -18,6 +19,8 @@ pub const MIN_LWE_DIM: u64 = DEFAUT_DOMAINS.free_glwe.glwe_dimension.start as u6
 pub const MAX_LWE_DIM: u64 = DEFAUT_DOMAINS.free_glwe.glwe_dimension.end as u64 - 1;

 fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let processing_unit = config::ProcessingUnit::Cpu;
+
    let sum_size = 4096;
    let p_error = _4_SIGMA;
    let security_level = 128;
@@ -41,7 +44,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        complexity_model: &CpuComplexity::default(),
    };

-    let cache = decomposition::cache(security_level);
+    let cache = decomposition::cache(security_level, processing_unit, None);

    let solutions: Vec<_> = precisions
        .clone()
--- a/concrete-optimizer-cpp/src/concrete-optimizer.rs
+++ b/concrete-optimizer-cpp/src/concrete-optimizer.rs
@@ -1,4 +1,5 @@
 use concrete_optimizer::computing_cost::cpu::CpuComplexity;
+use concrete_optimizer::config;
 use concrete_optimizer::dag::operator::{
    self, FunctionTable, LevelledComplexity, OperatorIndex, Precision, Shape,
 };
@@ -22,6 +23,8 @@ fn no_dag_solution() -> ffi::DagSolution {
 }

 fn optimize_bootstrap(precision: u64, noise_factor: f64, options: ffi::Options) -> ffi::Solution {
+    let processing_unit = config::ProcessingUnit::Cpu;
+
    let config = Config {
        security_level: options.security_level,
        maximum_acceptable_error_probability: options.maximum_acceptable_error_probability,
@@ -31,7 +34,7 @@ fn optimize_bootstrap(precision: u64, noise_factor: f64, options: ffi::Options)

    let sum_size = 1;

-    let search_space = SearchSpace::default();
+    let search_space = SearchSpace::default(processing_unit);

    let result = concrete_optimizer::optimization::atomic_pattern::optimize_one(
        sum_size,
@@ -39,7 +42,7 @@ fn optimize_bootstrap(precision: u64, noise_factor: f64, options: ffi::Options)
        config,
        noise_factor,
        &search_space,
-        &decomposition::cache(options.security_level),
+        &decomposition::cache(options.security_level, processing_unit, None),
    );
    result
        .best_solution
@@ -199,6 +202,7 @@ impl OperationDag {
    }

    fn optimize_v0(&self, options: ffi::Options) -> ffi::Solution {
+        let processing_unit = config::ProcessingUnit::Cpu;
        let config = Config {
            security_level: options.security_level,
            maximum_acceptable_error_probability: options.maximum_acceptable_error_probability,
@@ -206,13 +210,13 @@ impl OperationDag {
            complexity_model: &CpuComplexity::default(),
        };

-        let search_space = SearchSpace::default();
+        let search_space = SearchSpace::default(processing_unit);

        let result = concrete_optimizer::optimization::dag::solo_key::optimize::optimize(
            &self.0,
            config,
            &search_space,
-            &decomposition::cache(options.security_level),
+            &decomposition::cache(options.security_level, processing_unit, None),
        );
        result
            .best_solution
@@ -220,6 +224,7 @@ impl OperationDag {
    }

    fn optimize(&self, options: ffi::Options) -> ffi::DagSolution {
+        let processing_unit = config::ProcessingUnit::Cpu;
        let config = Config {
            security_level: options.security_level,
            maximum_acceptable_error_probability: options.maximum_acceptable_error_probability,
@@ -227,8 +232,8 @@ impl OperationDag {
            complexity_model: &CpuComplexity::default(),
        };

-        let search_space = SearchSpace::default();
-        let cache = decomposition::cache(options.security_level);
+        let search_space = SearchSpace::default(processing_unit);
+        let cache = decomposition::cache(options.security_level, processing_unit, None);

        let result = concrete_optimizer::optimization::dag::solo_key::optimize_generic::optimize(
            &self.0,
--- a/concrete-optimizer/src/computing_cost/atomic_pattern.rs
+++ b/concrete-optimizer/src/computing_cost/atomic_pattern.rs
@@ -2,6 +2,7 @@ use super::complexity::Complexity;
 use super::complexity_model::ComplexityModel;
 use crate::parameters::AtomicPatternParameters;

+#[allow(dead_code)]
 pub fn atomic_pattern_complexity(
    complexity_model: &dyn ComplexityModel,
    sum_size: u64,
--- a/concrete-optimizer/src/computing_cost/cpu.rs
+++ b/concrete-optimizer/src/computing_cost/cpu.rs
@@ -4,6 +4,7 @@ use super::operators::keyswitch_lwe::KsComplexity;
 use super::operators::{keyswitch_lwe, pbs};
 use crate::parameters::{KeyswitchParameters, LweDimension, PbsParameters};

+#[derive(Clone)]
 pub struct CpuComplexity {
    pub ks_lwe: keyswitch_lwe::KsComplexity,
    pub pbs: pbs::PbsComplexity,
--- a/concrete-optimizer/src/computing_cost/fft.rs
+++ b/concrete-optimizer/src/computing_cost/fft.rs
@@ -1,6 +1,7 @@
 use super::complexity::Complexity;

 /** Standard fft complexity model */
+#[derive(Clone)]
 pub struct AsymptoticWithFactors {
    factor_fft: f64,  // factor applied on asymptotic complexity
    factor_ifft: f64, // factor applied on asymptotic complexity
@@ -32,14 +33,6 @@ impl Default for AsymptoticWithFactors {
 #[cfg(test)]
 pub mod tests {
    use crate::computing_cost::fft;
-    use crate::computing_cost::fft::AsymptoticWithFactors;
-
-    /** Standard fft complexity with X factors*/
-    pub const COST_AWS: AsymptoticWithFactors = AsymptoticWithFactors {
-        // https://github.com/zama-ai/concrete-optimizer/blob/prototype/python/optimizer/noise_formulas/bootstrap.py#L150
-        factor_fft: 0.202_926_951_153_089_17,
-        factor_ifft: 0.407_795_078_512_891,
-    };

    #[test]
    fn golden_python_prototype() {
--- a/concrete-optimizer/src/computing_cost/gpu.rs
+++ b/concrete-optimizer/src/computing_cost/gpu.rs
@@ -4,125 +4,52 @@ use crate::parameters::{KeyswitchParameters, LweDimension, PbsParameters};
 use crate::utils::square;

 #[derive(Clone, Copy)]
-pub struct GpuPbsComplexity {
-    pub w1: f64,
-    pub w2: f64,
-    pub w3: f64,
-    pub w4: f64,
-    pub occupancy: f64,
-}
-
-//https://github.com/zama-ai/concrete-core-internal/issues/91
-impl GpuPbsComplexity {
-    pub fn default_lowlat_u64(occupancy: f64) -> Self {
-        Self {
-            w1: 2_576.105_013_4,
-            w2: -21_631.382_229_52,
-            w3: -86_525.527_535_17,
-            w4: 0.125_472_398_538_904_43,
-            occupancy,
-        }
-    }
+pub enum GpuPbsComplexity {
+    Lowlat, // PBS variant set by `GpuComplexity::default_lowlat_u64`
+    Amortized, // PBS variant set by `GpuComplexity::default_amortized_u64`
 }

 #[derive(Clone, Copy)]
-pub struct GpuKsComplexity {
-    pub w1: f64,
-    pub w2: f64,
-    pub w3: f64,
-    pub w4: f64,
-    pub occupancy: f64,
-    pub number_of_sm: u64,
-}
-
-// https://github.com/zama-ai/concrete-core-internal/issues/90
-impl GpuKsComplexity {
-    pub fn default_u64(occupancy: f64, number_of_sm: u64) -> Self {
-        Self {
-            w1: 7_959.869_676_54,
-            w2: 3_866.817_732_87,
-            w3: 8_353.484_127_44,
-            w4: 0.125_472_398_538_904_43,
-            occupancy,
-            number_of_sm,
-        }
-    }
-}
+pub struct GpuKsComplexity; // unit struct: the former w1..w4 / occupancy / number_of_sm fields are removed by this change

 #[derive(Clone, Copy)]
 pub struct GpuComplexity {
    pub ks: GpuKsComplexity,
    pub pbs: GpuPbsComplexity,
-    pub ncores: u64,
+    pub number_of_sm: u64,
+}
+
+impl GpuComplexity { // constructors: one per PBS variant
+    pub fn default_lowlat_u64(number_of_sm: u64) -> Self {
+        Self {
+            ks: GpuKsComplexity, // unit struct: nothing to configure
+            pbs: GpuPbsComplexity::Lowlat, // low-latency PBS variant
+            number_of_sm, // stored exactly as passed in
+        }
+    }
+
+    pub fn default_amortized_u64(number_of_sm: u64) -> Self {
+        Self {
+            ks: GpuKsComplexity, // unit struct: nothing to configure
+            pbs: GpuPbsComplexity::Amortized, // sole difference from `default_lowlat_u64`
+            number_of_sm, // stored exactly as passed in
+        }
+    }
+}

 impl ComplexityModel for GpuComplexity {
    #[allow(clippy::let_and_return, non_snake_case)]
-    fn pbs_complexity(&self, params: PbsParameters, _ciphertext_modulus_log: u32) -> Complexity {
-        let GpuPbsComplexity {
-            w1,
-            w2,
-            w3,
-            w4,
-            occupancy,
-        } = self.pbs;
-
-        let n = params.internal_lwe_dimension.0 as f64;
-        let k = params.output_glwe_params.glwe_dimension as f64;
-        let N = (1 << params.output_glwe_params.log2_polynomial_size) as f64;
-
-        let ell = params.br_decomposition_parameter.level as f64;
-
-        let number_of_ct = 1.;
-
-        let number_of_operations = number_of_ct * algorithmic_complexity_pbs(n, k, N, ell);
-
-        let size = std::mem::size_of::<u64>() as f64;
-
-        let pbs_cost = w4 * number_of_operations / (self.ncores as f64 * occupancy)
-            + (w1 * n * (2. + ell * N * square(k + 1.))
-                + 2. * N * ell * (w2 + w3 * square(k + 1.)))
-                * size;
-
-        pbs_cost
+    fn pbs_complexity(&self, _params: PbsParameters, _ciphertext_modulus_log: u32) -> Complexity {
+        todo!()
    }

    #[allow(clippy::let_and_return)]
    fn ks_complexity(
        &self,
-        params: KeyswitchParameters,
-        ciphertext_modulus_log: u32,
+        _params: KeyswitchParameters,
+        _ciphertext_modulus_log: u32,
    ) -> Complexity {
-        let GpuKsComplexity {
-            w1,
-            w2,
-            w3,
-            w4,
-            occupancy,
-            number_of_sm,
-        } = self.ks;
-
-        let na = params.input_lwe_dimension.0 as f64;
-
-        let nb = params.output_lwe_dimension.0 as f64;
-
-        let ell = params.ks_decomposition_parameter.level as f64;
-
-        let number_of_ct = 1.;
-
-        let number_of_operations =
-            number_of_ct * algorithmic_complexity_ks(na, nb, ell, ciphertext_modulus_log as f64);
-
-        let size = std::mem::size_of::<u64>() as f64;
-
-        let ks_cost = w4 * number_of_operations / (self.ncores as f64 * occupancy)
-            + w1 * (number_of_ct * ((na + 1.) + (nb + 1.)) + ell * (nb + 1.) * na) * size
-            + w2 * number_of_ct * nb * size
-            + w3 * (number_of_ct / number_of_ct.min(number_of_sm as f64 * 12.)).ceil()
-                * ((na + 1.) + (nb + 1.))
-            + ell * (nb + 1.) * size;
-
-        ks_cost
+        todo!()
    }

    fn levelled_complexity(
@@ -136,6 +63,7 @@ impl ComplexityModel for GpuComplexity {
 }

 #[allow(non_snake_case)]
+#[allow(dead_code)]
 fn algorithmic_complexity_pbs(n: f64, k: f64, N: f64, ell: f64) -> f64 {
    n * (ell * (k + 1.) * N * (N.log2() + 1.)
        + (k + 1.) * N * (N.log2() + 1.)
@@ -143,6 +71,7 @@ fn algorithmic_complexity_pbs(n: f64, k: f64, N: f64, ell: f64) -> f64 {
 }

 #[allow(non_snake_case)]
+#[allow(dead_code)]
 fn algorithmic_complexity_ks(na: f64, nb: f64, ell: f64, log2_q: f64) -> f64 {
    na * nb * ell * log2_q
 }
--- a/concrete-optimizer/src/computing_cost/mod.rs
+++ b/concrete-optimizer/src/computing_cost/mod.rs
@@ -1,7 +1,7 @@
-pub mod atomic_pattern;
+mod atomic_pattern;
 pub mod complexity;
 pub mod complexity_model;
 pub mod cpu;
-pub mod fft;
+mod fft;
 pub mod gpu;
 pub mod operators;
--- a/concrete-optimizer/src/computing_cost/operators/cmux.rs
+++ b/concrete-optimizer/src/computing_cost/operators/cmux.rs
@@ -3,6 +3,7 @@ use super::super::fft;
 use crate::parameters::CmuxParameters;
 use crate::utils::square;

+#[derive(Clone)]
 pub struct SimpleWithFactors {
    fft: fft::AsymptoticWithFactors,
    blind_rotate_factor: f64,
--- a/concrete-optimizer/src/computing_cost/operators/keyswitch_lwe.rs
+++ b/concrete-optimizer/src/computing_cost/operators/keyswitch_lwe.rs
@@ -1,6 +1,7 @@
 use super::super::complexity::Complexity;
 use crate::parameters::KeyswitchParameters;

+#[derive(Clone)]
 pub struct KsComplexity;

 impl KsComplexity {
--- a/concrete-optimizer/src/computing_cost/operators/mod.rs
+++ b/concrete-optimizer/src/computing_cost/operators/mod.rs
@@ -1,3 +1,3 @@
 pub mod cmux;
-pub mod keyswitch_lwe;
-pub mod pbs;
+pub(super) mod keyswitch_lwe;
+pub(super) mod pbs;
--- a/concrete-optimizer/src/computing_cost/operators/pbs.rs
+++ b/concrete-optimizer/src/computing_cost/operators/pbs.rs
@@ -2,7 +2,7 @@ use super::super::complexity::Complexity;
 use super::cmux;
 use crate::parameters::PbsParameters;

-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct PbsComplexity {
    pub cmux: cmux::SimpleWithFactors,
 }
--- a/concrete-optimizer/src/config.rs
+++ b/concrete-optimizer/src/config.rs
@@ -0,0 +1,63 @@
+use std::sync::Arc;
+
+use crate::computing_cost::complexity_model::ComplexityModel;
+use crate::computing_cost::cpu::CpuComplexity;
+use crate::computing_cost::gpu::GpuComplexity;
+use crate::optimization::config::{MAX_LOG2_BASE_CPU, MAX_LOG2_BASE_GPU};
+
+#[derive(Clone, Copy)]
+pub enum ProcessingUnit {
+    Cpu, // maps to `CpuComplexity`, `MAX_LOG2_BASE_CPU` and the "cpu" cache tags
+    Gpu {
+        pbs_type: GpuPbsType, // picks the GPU PBS variant (low-latency or amortized)
+        number_of_sm: u64, // forwarded unchanged to the `GpuComplexity` constructors
+    },
+}
+
+#[derive(Clone, Copy)]
+pub enum GpuPbsType {
+    Lowlat, // cache tag "gpu_lowlat"; model built by `GpuComplexity::default_lowlat_u64`
+    Amortized, // cache tag "gpu_amortized"; model built by `GpuComplexity::default_amortized_u64`
+}
+
+impl ProcessingUnit {
+    /// Upper bound on the blind-rotate decomposition base (log2) for this unit.
+    pub fn max_br_base_log(self) -> u64 {
+        match self {
+            Self::Gpu { .. } => MAX_LOG2_BASE_GPU,
+            Self::Cpu => MAX_LOG2_BASE_CPU,
+        }
+    }
+
+    /// Tag naming this unit in the keyswitch decomposition cache path.
+    pub fn ks_to_string(self) -> &'static str {
+        match self {
+            Self::Gpu { .. } => "gpu",
+            Self::Cpu => "cpu",
+        }
+    }
+    /// Tag naming this unit in the blind-rotate decomposition cache path.
+    pub fn br_to_string(self) -> &'static str {
+        match self {
+            Self::Cpu => "cpu",
+            Self::Gpu { pbs_type, .. } => match pbs_type {
+                GpuPbsType::Lowlat => "gpu_lowlat",
+                GpuPbsType::Amortized => "gpu_amortized",
+            },
+        }
+    }
+
+    /// Default complexity model for this unit, shared behind an `Arc`.
+    pub fn complexity_model(self) -> Arc<dyn ComplexityModel> {
+        match self {
+            Self::Cpu => Arc::new(CpuComplexity::default()),
+            Self::Gpu {
+                pbs_type,
+                number_of_sm,
+            } => Arc::new(match pbs_type {
+                GpuPbsType::Amortized => GpuComplexity::default_amortized_u64(number_of_sm),
+                GpuPbsType::Lowlat => GpuComplexity::default_lowlat_u64(number_of_sm),
+            }),
+        }
+    }
+}
--- a/concrete-optimizer/src/lib.rs
+++ b/concrete-optimizer/src/lib.rs
@@ -19,6 +19,7 @@

 pub mod computing_cost;

+pub mod config;
 pub mod dag;
 pub mod global_parameters;
 pub mod noise_estimator;
--- a/concrete-optimizer/src/optimization/config.rs
+++ b/concrete-optimizer/src/optimization/config.rs
@@ -1,4 +1,6 @@
 use crate::computing_cost::complexity_model::ComplexityModel;
+use crate::config;
+use crate::config::GpuPbsType;
 use crate::global_parameters::DEFAUT_DOMAINS;

 #[derive(Clone, Copy, Debug)]
@@ -23,8 +25,8 @@ pub struct SearchSpace {
    pub internal_lwe_dimensions: Vec<u64>,
 }

-impl Default for SearchSpace {
-    fn default() -> Self {
+impl SearchSpace {
+    pub fn default_cpu() -> Self {
        let glwe_log_polynomial_sizes: Vec<u64> = DEFAUT_DOMAINS
            .glwe_pbs_constrained
            .log2_polynomial_size
@@ -38,4 +40,55 @@ impl Default for SearchSpace {
            internal_lwe_dimensions,
        }
    }
+
+    /// Default search space for the GPU low-latency bootstrap.
+    ///
+    /// Candidates are log2 polynomial sizes 9..=11 and the single GLWE
+    /// dimension 1; the URL above each of those fields is the reference given
+    /// for the limit. Internal LWE dimensions are read from
+    /// `DEFAUT_DOMAINS.free_glwe.glwe_dimension`.
+    pub fn default_gpu_lowlat() -> Self {
+        Self {
+            // https://github.com/zama-ai/concrete-core/blob/6b52182ab44c4b39ddebca1c457e1096fb687801/concrete-cuda/cuda/src/bootstrap_low_latency.cu#L156
+            glwe_log_polynomial_sizes: (9..=11).collect(),
+            // https://github.com/zama-ai/concrete-core/blob/6b52182ab44c4b39ddebca1c457e1096fb687801/concrete-cuda/cuda/src/bootstrap_low_latency.cu#L154
+            glwe_dimensions: vec![1],
+            internal_lwe_dimensions: DEFAUT_DOMAINS.free_glwe.glwe_dimension.as_vec(),
+        }
+    }
+
+    /// Default search space for the GPU amortized bootstrap.
+    ///
+    /// Candidates are log2 polynomial sizes 9..=13 and the single GLWE
+    /// dimension 1; the URL above each of those fields is the reference given
+    /// for the limit. Internal LWE dimensions are read from
+    /// `DEFAUT_DOMAINS.free_glwe.glwe_dimension`.
+    pub fn default_gpu_amortized() -> Self {
+        Self {
+            // https://github.com/zama-ai/concrete-core/blob/6b52182ab44c4b39ddebca1c457e1096fb687801/concrete-cuda/cuda/src/bootstrap_amortized.cu#L79
+            glwe_log_polynomial_sizes: (9..=13).collect(),
+            // https://github.com/zama-ai/concrete-core/blob/6b52182ab44c4b39ddebca1c457e1096fb687801/concrete-cuda/cuda/src/bootstrap_amortized.cu#L78
+            glwe_dimensions: vec![1],
+            internal_lwe_dimensions: DEFAUT_DOMAINS.free_glwe.glwe_dimension.as_vec(),
+        }
+    }
+    /// Default search space for the given processing unit.
+    ///
+    /// Dispatches to `default_cpu`, `default_gpu_amortized` or
+    /// `default_gpu_lowlat`; the GPU's `number_of_sm` plays no role here.
+    pub fn default(processing_unit: config::ProcessingUnit) -> Self {
+        match processing_unit {
+            config::ProcessingUnit::Cpu => Self::default_cpu(),
+            config::ProcessingUnit::Gpu { pbs_type, .. } => match pbs_type {
+                GpuPbsType::Amortized => Self::default_gpu_amortized(),
+                GpuPbsType::Lowlat => Self::default_gpu_lowlat(),
+            },
+        }
+    }
 }
+
+// https://github.com/zama-ai/concrete-core/blob/6b52182ab44c4b39ddebca1c457e1096fb687801/concrete-cuda/cuda/src/bootstrap_amortized.cu#L77
+// https://github.com/zama-ai/concrete-core/blob/6b52182ab44c4b39ddebca1c457e1096fb687801/concrete-cuda/cuda/src/bootstrap_low_latency.cu#L153
+pub const MAX_LOG2_BASE_GPU: u64 = 16; // `ProcessingUnit::max_br_base_log` result for any `Gpu`
+
+pub const MAX_LOG2_BASE_CPU: u64 = 64; // `ProcessingUnit::max_br_base_log` result for `Cpu`
--- a/concrete-optimizer/src/optimization/dag/solo_key/optimize.rs
+++ b/concrete-optimizer/src/optimization/dag/solo_key/optimize.rs
@@ -359,12 +359,12 @@ mod tests {

    use super::*;
    use crate::computing_cost::cpu::CpuComplexity;
+    use crate::config;
    use crate::dag::operator::{FunctionTable, Shape, Weights};
    use crate::noise_estimator::p_error::repeat_p_error;
-    use crate::optimization::atomic_pattern;
    use crate::optimization::config::SearchSpace;
    use crate::optimization::dag::solo_key::symbolic_variance::VarianceOrigin;
-    use crate::optimization::decomposition;
+    use crate::optimization::{atomic_pattern, decomposition};
    use crate::utils::square;

    fn small_relative_diff(v1: f64, v2: f64) -> bool {
@@ -399,7 +399,7 @@ mod tests {
            complexity_model: &CpuComplexity::default(),
        };

-        let search_space = SearchSpace::default();
+        let search_space = SearchSpace::default_cpu();

        super::optimize(dag, config, &search_space, cache)
    }
@@ -429,7 +429,9 @@ mod tests {
    }

    fn v0_parameter_ref(precision: u64, weight: u64, times: &mut Times) {
-        let search_space = SearchSpace::default();
+        let processing_unit = config::ProcessingUnit::Cpu;
+
+        let search_space = SearchSpace::default(processing_unit);

        let sum_size = 1;

@@ -440,7 +442,7 @@ mod tests {
            complexity_model: &CpuComplexity::default(),
        };

-        let cache = decomposition::cache(config.security_level);
+        let cache = decomposition::cache(config.security_level, processing_unit, None);

        let _ = optimize_v0(
            sum_size,
@@ -499,9 +501,10 @@ mod tests {
    }

    fn v0_parameter_ref_with_dot(precision: Precision, weight: i64) {
+        let processing_unit = config::ProcessingUnit::Cpu;
        let security_level = 128;

-        let cache = decomposition::cache(security_level);
+        let cache = decomposition::cache(security_level, processing_unit, None);

        let mut dag = unparametrized::OperationDag::new();
        {
@@ -530,7 +533,7 @@ mod tests {
            assert_f64_eq(square(weight) as f64, constraint.pareto_in_lut[0].lut_coeff);
        }

-        let search_space = SearchSpace::default();
+        let search_space = SearchSpace::default(processing_unit);

        let config = Config {
            security_level,
@@ -589,7 +592,8 @@ mod tests {
    }
    #[test]
    fn test_lut_vs_no_lut() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        for precision in 1..=8 {
            no_lut_vs_lut(precision, &cache);
        }
@@ -632,7 +636,8 @@ mod tests {

    #[test]
    fn test_lut_with_input_base_noise_better_than_lut_with_lut_base_noise() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        for log_weight in 1..=16 {
            let weight = 1 << log_weight;
            for precision in 5..=9 {
@@ -666,7 +671,8 @@ mod tests {

    #[test]
    fn test_lut_1_layer_is_better() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        // for some reason on 4, 5, 6, the complexity is already minimal
        // this could be due to pre-defined pareto set
        for precision in [1, 2, 3, 7, 8] {
@@ -722,7 +728,8 @@ mod tests {

    #[test]
    fn test_multi_precision_dominate_single() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        let mut prev = Some(true); // true -> ... -> true -> false -> ... -> false
        for log2_weight in 0..29 {
            let weight = 1 << log2_weight;
@@ -756,7 +763,8 @@ mod tests {

    #[test]
    fn test_global_p_error_input() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        for precision in [4_u8, 8] {
            for weight in [1, 3, 27, 243, 729] {
                for dim in [1, 2, 16, 32] {
@@ -786,7 +794,8 @@ mod tests {

    #[test]
    fn test_global_p_error_lut() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        for precision in [4_u8, 8] {
            for weight in [1, 3, 27, 243, 729] {
                for depth in [2, 16, 32] {
@@ -847,7 +856,8 @@ mod tests {
    #[allow(clippy::unnecessary_cast)] // clippy bug refusing as Precision on const
    #[test]
    fn test_global_p_error_dominating_lut() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        let depth = 128;
        let weights_low = 1;
        let weights_high = 1;
@@ -875,7 +885,8 @@ mod tests {
    #[allow(clippy::unnecessary_cast)] // clippy bug refusing as Precision on const
    #[test]
    fn test_global_p_error_non_dominating_lut() {
-        let cache = decomposition::cache(128);
+        let processing_unit = config::ProcessingUnit::Cpu;
+        let cache = decomposition::cache(128, processing_unit, None);
        let depth = 128;
        let weights_low = 1024 * 1024 * 3;
        let weights_high = 1;
--- a/concrete-optimizer/src/optimization/decomposition/blind_rotate.rs
+++ b/concrete-optimizer/src/optimization/decomposition/blind_rotate.rs
@@ -1,13 +1,15 @@
+use std::sync::Arc;
+
 use serde::{Deserialize, Serialize};

 use concrete_commons::dispersion::DispersionParameter;

-use crate::computing_cost::operators::pbs::PbsComplexity;
+use crate::computing_cost::complexity_model::ComplexityModel;
 use crate::noise_estimator::operators::atomic_pattern as noise_atomic_pattern;
 use crate::parameters::{BrDecompositionParameters, GlweParameters, LweDimension, PbsParameters};
-use crate::security;
 use crate::utils::cache::ephemeral::{CacheHashMap, EphemeralCache};
 use crate::utils::cache::persistent::PersistentCacheHashMap;
+use crate::{config, security};

 use super::common::MacroParam;
 use super::cut::ComplexityNoise;
@@ -21,10 +23,12 @@ pub struct BrComplexityNoise {

 /* This is strictly variance decreasing and strictly complexity increasing */
 pub fn pareto_quantities(
+    complexity_model: &dyn ComplexityModel,
    ciphertext_modulus_log: u32,
    security_level: u64,
    internal_dim: u64,
    glwe_params: GlweParameters,
+    max_log2_base: u64,
 ) -> Vec<BrComplexityNoise> {
    assert!(ciphertext_modulus_log == 64);
    let pbs_param = |level, log2_base| {
@@ -38,24 +42,22 @@ pub fn pareto_quantities(
    let variance_bsk =
        security::glwe::minimal_variance(glwe_params, ciphertext_modulus_log, security_level);

-    let mut quantities = Vec::with_capacity(64);
+    let mut quantities = Vec::with_capacity(max_log2_base as usize);
    let mut increasing_complexity = 0.0;
    let mut decreasing_variance = f64::INFINITY;
    let mut counting_no_progress = 0;
-    let mut prev_best_log2_base = 0_u64;
-    let max_level = ciphertext_modulus_log as u64;
-    for level in 1..=max_level {
+
+    let mut prev_best_log2_base = max_log2_base;
+
+    for level in 1..=ciphertext_modulus_log as u64 {
        // detect increasing noise
        let mut level_decreasing_base_noise = f64::INFINITY;
        let mut best_log2_base = 0_u64;
-        let range: Vec<_> = if level == 1 {
-            (1..=(max_level / level)).collect()
-        } else {
-            // we know a max is between 1 and prev_best_log2_base
-            // and the curve has only 1 maximum close to prev_best_log2_base
-            // so we start on prev_best_log2_base
-            (1..=prev_best_log2_base).rev().collect()
-        };
+        // we know a max is between 1 and prev_best_log2_base
+        // and the curve has only 1 maximum close to prev_best_log2_base
+        // so we start on prev_best_log2_base
+        let range = (1..=prev_best_log2_base).rev();
+
        for log2_base in range {
            let base_noise = noise_atomic_pattern::variance_bootstrap(
                pbs_param(level, log2_base),
@@ -81,7 +83,7 @@ pub fn pareto_quantities(
            continue;
        }
        let params = pbs_param(level, best_log2_base);
-        let complexity_pbs = PbsComplexity::default().complexity(params, ciphertext_modulus_log);
+        let complexity_pbs = complexity_model.pbs_complexity(params, ciphertext_modulus_log);

        quantities.push(BrComplexityNoise {
            decomp: params.br_decomposition_parameter,
@@ -118,19 +120,33 @@ impl Cache {

 pub type PersistDecompCache = PersistentCacheHashMap<MacroParam, Vec<BrComplexityNoise>>;

-pub fn cache(security_level: u64) -> PersistDecompCache {
+pub fn cache(
+    security_level: u64,
+    processing_unit: config::ProcessingUnit,
+    complexity_model: Option<Arc<dyn ComplexityModel>>,
+) -> PersistDecompCache {
+    let max_log2_base = processing_unit.max_br_base_log();
+
    let ciphertext_modulus_log = 64;
    let tmp: String = std::env::temp_dir()
        .to_str()
        .expect("Invalid tmp dir")
        .into();
-    let path = format!("{tmp}/optimizer/cache/br-decomp-cpu-64-{security_level}");
+
+    let hardware = processing_unit.br_to_string();
+
+    let path = format!("{tmp}/optimizer/cache/br-decomp-{hardware}-64-{security_level}");
+
+    let complexity_model = complexity_model.unwrap_or_else(|| processing_unit.complexity_model());
+
    let function = move |(glwe_params, internal_dim): MacroParam| {
        pareto_quantities(
+            complexity_model.as_ref(),
            ciphertext_modulus_log,
            security_level,
            internal_dim,
            glwe_params,
+            max_log2_base,
        )
    };
    PersistentCacheHashMap::new(&path, "v0", function)
--- a/concrete-optimizer/src/optimization/decomposition/keyswitch.rs
+++ b/concrete-optimizer/src/optimization/decomposition/keyswitch.rs
@@ -1,8 +1,11 @@
+use std::sync::Arc;
+
 use serde::{Deserialize, Serialize};

 use concrete_commons::dispersion::DispersionParameter;

-use crate::computing_cost::operators::keyswitch_lwe::KsComplexity;
+use crate::computing_cost::complexity_model::ComplexityModel;
+use crate::config;
 use crate::noise_estimator::operators::atomic_pattern as noise_atomic_pattern;
 use crate::parameters::{
    GlweParameters, KeyswitchParameters, KsDecompositionParameters, LweDimension,
@@ -31,6 +34,7 @@ impl ComplexityNoise for KsComplexityNoise {

 /* This is strictly variance decreasing and strictly complexity increasing */
 pub fn pareto_quantities(
+    complexity_model: &dyn ComplexityModel,
    ciphertext_modulus_log: u32,
    security_level: u64,
    internal_dim: u64,
@@ -54,20 +58,18 @@ pub fn pareto_quantities(
    let mut increasing_complexity = 0.0;
    let mut decreasing_variance = f64::INFINITY;
    let mut counting_no_progress = 0;
-    let mut prev_best_log2_base = 0_u64;
-    let max_level = ciphertext_modulus_log as u64;
-    for level in 1..=max_level {
+    let mut prev_best_log2_base = ciphertext_modulus_log as u64;
+
+    for level in 1..=ciphertext_modulus_log as u64 {
        // detect increasing noise
        let mut level_decreasing_base_noise = f64::INFINITY;
        let mut best_log2_base = 0_u64;
-        let range: Vec<_> = if level == 1 {
-            (1..=(max_level / level)).collect()
-        } else {
-            // we know a max is between 1 and prev_best_log2_base
-            // and the curve has only 1 maximum close to prev_best_log2_base
-            // so we start on prev_best_log2_base
-            (1..=prev_best_log2_base).rev().collect()
-        };
+
+        // we know a max is between 1 and prev_best_log2_base
+        // and the curve has only 1 maximum close to prev_best_log2_base
+        // so we start on prev_best_log2_base
+        let range = (1..=prev_best_log2_base).rev();
+
        for log2_base in range {
            let noise_keyswitch = noise_atomic_pattern::variance_keyswitch(
                ks_param(level, log2_base),
@@ -93,7 +95,8 @@ pub fn pareto_quantities(
            continue;
        }
        let ks_params = ks_param(level, best_log2_base);
-        let complexity_keyswitch = KsComplexity.complexity(ks_params, ciphertext_modulus_log);
+        let complexity_keyswitch =
+            complexity_model.ks_complexity(ks_params, ciphertext_modulus_log);
        quantities.push(KsComplexityNoise {
            decomp: ks_params.ks_decomposition_parameter,
            noise: level_decreasing_base_noise,
@@ -120,15 +123,26 @@ impl Cache {

 pub type PersistDecompCache = PersistentCacheHashMap<MacroParam, Vec<KsComplexityNoise>>;

-pub fn cache(security_level: u64) -> PersistDecompCache {
+pub fn cache(
+    security_level: u64,
+    processing_unit: config::ProcessingUnit,
+    complexity_model: Option<Arc<dyn ComplexityModel>>,
+) -> PersistDecompCache {
    let ciphertext_modulus_log = 64;
    let tmp: String = std::env::temp_dir()
        .to_str()
        .expect("Invalid tmp dir")
        .into();
-    let path = format!("{tmp}/optimizer/cache/ks-decomp-cpu-64-{security_level}");
+
+    let hardware = processing_unit.ks_to_string();
+
+    let path = format!("{tmp}/optimizer/cache/ks-decomp-{hardware}-64-{security_level}");
+
+    let complexity_model = complexity_model.unwrap_or_else(|| processing_unit.complexity_model());
+
    let function = move |(glwe_params, internal_dim): MacroParam| {
        pareto_quantities(
+            complexity_model.as_ref(),
            ciphertext_modulus_log,
            security_level,
            internal_dim,
--- a/concrete-optimizer/src/optimization/decomposition/mod.rs
+++ b/concrete-optimizer/src/optimization/decomposition/mod.rs
@@ -3,17 +3,30 @@ pub mod common;
 pub mod cut;
 pub mod keyswitch;

+use std::sync::Arc;
+
 pub use common::MacroParam;
 pub use cut::cut_complexity_noise;

+use crate::computing_cost::complexity_model::ComplexityModel;
+use crate::config;
+
 pub struct PersistDecompCache {
    pub ks: keyswitch::PersistDecompCache,
    pub br: blind_rotate::PersistDecompCache,
 }

-pub fn cache(security_level: u64) -> PersistDecompCache {
+pub fn cache(
+    security_level: u64,
+    processing_unit: config::ProcessingUnit, // forwarded as-is to both sub-caches
+    complexity_model: Option<Arc<dyn ComplexityModel>>, // `None`: each sub-cache calls `processing_unit.complexity_model()`
+) -> PersistDecompCache {
     PersistDecompCache {
-        ks: keyswitch::cache(security_level),
-        br: blind_rotate::cache(security_level),
+        ks: keyswitch::cache(security_level, processing_unit, complexity_model.clone()), // clone so `br` below can take the original
+        br: blind_rotate::cache(security_level, processing_unit, complexity_model),
     }
 }
+
+trait ComplexityModelClone: ComplexityModel + Clone {} // NOTE(review): no user of this private trait is visible in this change — confirm it is needed
+
+impl<T: ComplexityModel + Clone> ComplexityModelClone for T {} // blanket impl: every `ComplexityModel + Clone` type
--- a/v0-parameters/src/lib.rs
+++ b/v0-parameters/src/lib.rs
@@ -10,6 +10,7 @@

 use clap::Parser;
 use concrete_optimizer::computing_cost::cpu::CpuComplexity;
+use concrete_optimizer::config;
 use concrete_optimizer::global_parameters::DEFAUT_DOMAINS;
 use concrete_optimizer::optimization::atomic_pattern::{
    self as optimize_atomic_pattern, OptimizationState,
@@ -84,6 +85,8 @@ pub struct Args {
 }

 pub fn all_results(args: &Args) -> Vec<Vec<OptimizationState>> {
+    let processing_unit = config::ProcessingUnit::Cpu;
+
    let sum_size = args.sum_size;
    let maximum_acceptable_error_probability = args.p_error;
    let security_level = args.security_level;
@@ -108,7 +111,7 @@ pub fn all_results(args: &Args) -> Vec<Vec<OptimizationState>> {
        complexity_model: &CpuComplexity::default(),
    };

-    let cache = decomposition::cache(config.security_level);
+    let cache = decomposition::cache(config.security_level, processing_unit, None);

    precisions_iter
        .map(|precision| {