feat(hpu): Add Hpu backend implementation

This backend abstracts communication with the Hpu Fpga hardware. It defines its own entities to prevent circular dependencies with tfhe-rs. Object lifetime is handled through Arc<Mutex<T>> wrappers, which enforce that all objects currently alive in the Hpu Hw are also kept valid on the host side. It contains the second version of the HPU instruction set (HIS_V2.0): * DOps have the following properties: + Templates as first-class citizens + Support of Immediate templates + Direct parser and conversion between Asm/Hex + Replace deku (and its associated endianness limitation) by bitfield_struct and manual parsing * IOps have the following properties: + Support a variable number of Destinations + Support a variable number of Sources + Support a variable number of Immediate values + Support of multiple bitwidths (not implemented yet in the Fpga firmware) Details can be found in `backends/tfhe-hpu-backend/Readme.md`
2026-01-08 22:28:01 -05:00 · 2025-05-16 14:15:38 +02:00
parent a7d8d2b1d4
commit 9ee8259002
301 changed files with 46112 additions and 461 deletions
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -35,6 +35,8 @@ boolean = ["tfhe/boolean"]
 shortint = ["tfhe/shortint"]
 integer = ["shortint", "tfhe/integer"]
 gpu = ["tfhe/gpu"]
+hpu = ["tfhe/hpu"]
+hpu-v80 = ["tfhe/hpu-v80"]
 internal-keycache = ["tfhe/internal-keycache"]
 nightly-avx512 = ["tfhe/nightly-avx512"]
 pbs-stats = ["tfhe/pbs-stats"]
--- a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs
+++ b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs
@@ -726,7 +726,11 @@ fn mem_optimized_pbs_ntt(c: &mut Criterion) {
            bsk.ciphertext_modulus(),
        );

-        par_convert_standard_lwe_bootstrap_key_to_ntt64(&bsk, &mut nbsk);
+        par_convert_standard_lwe_bootstrap_key_to_ntt64(
+            &bsk,
+            &mut nbsk,
+            NttLweBootstrapKeyOption::Normalize,
+        );

        drop(bsk);

--- a/tfhe-benchmark/benches/high_level_api/bench.rs
+++ b/tfhe-benchmark/benches/high_level_api/bench.rs
@@ -1,17 +1,17 @@
-use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 use criterion::{black_box, Criterion};
 use rand::prelude::*;
 use std::fmt::Write;
 use std::ops::*;
 use tfhe::prelude::*;
 use tfhe::{
-    set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint10, FheUint12,
-    FheUint128, FheUint14, FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
+    ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
+    FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
 };

 fn bench_fhe_type<FheType>(c: &mut Criterion, client_key: &ClientKey, type_name: &str)
 where
    FheType: FheEncrypt<u128, ClientKey>,
+    FheType: FheWait,
    for<'a> &'a FheType: Add<&'a FheType, Output = FheType>
        + Sub<&'a FheType, Output = FheType>
        + Mul<&'a FheType, Output = FheType>
@@ -35,54 +35,133 @@ where
    let mut name = String::with_capacity(255);

    write!(name, "add({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs + &rhs)));
-    name.clear();
-
-    write!(name, "overflowing_add({type_name}, {type_name})").unwrap();
    bench_group.bench_function(&name, |b| {
-        b.iter(|| black_box((&lhs).overflowing_add(&rhs)))
+        b.iter(|| {
+            let res = &lhs + &rhs;
+            res.wait();
+            black_box(res)
+        })
    });
    name.clear();

-    write!(name, "overflowing_sub({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(lhs.overflowing_sub(&rhs))));
-    name.clear();
+    #[cfg(not(feature = "hpu"))]
+    {
+        write!(name, "overflowing_add({type_name}, {type_name})").unwrap();
+        bench_group.bench_function(&name, |b| {
+            b.iter(|| {
+                let (res, flag) = lhs.overflowing_add(&rhs);
+                res.wait();
+                black_box((res, flag))
+            })
+        });
+        name.clear();
+    }
+
+    #[cfg(not(feature = "hpu"))]
+    {
+        write!(name, "overflowing_sub({type_name}, {type_name})").unwrap();
+        bench_group.bench_function(&name, |b| {
+            b.iter(|| {
+                let (res, flag) = lhs.overflowing_sub(&rhs);
+                res.wait();
+                black_box((res, flag))
+            })
+        });
+        name.clear();
+    }

    write!(name, "sub({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs - &rhs)));
+    bench_group.bench_function(&name, |b| {
+        b.iter(|| {
+            let res = &lhs - &rhs;
+            res.wait();
+            black_box(res)
+        })
+    });
    name.clear();

    write!(name, "mul({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs * &rhs)));
+    bench_group.bench_function(&name, |b| {
+        b.iter(|| {
+            let res = &lhs * &rhs;
+            res.wait();
+            black_box(res)
+        })
+    });
    name.clear();

    write!(name, "bitand({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs & &rhs)));
+    bench_group.bench_function(&name, |b| {
+        b.iter(|| {
+            let res = &lhs & &rhs;
+            res.wait();
+            black_box(res)
+        })
+    });
    name.clear();

    write!(name, "bitor({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs | &rhs)));
+    bench_group.bench_function(&name, |b| {
+        b.iter(|| {
+            let res = &lhs | &rhs;
+            res.wait();
+            black_box(res)
+        })
+    });
    name.clear();

    write!(name, "bitxor({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs ^ &rhs)));
+    bench_group.bench_function(&name, |b| {
+        b.iter(|| {
+            let res = &lhs ^ &rhs;
+            res.wait();
+            black_box(res)
+        })
+    });
    name.clear();

-    write!(name, "shl({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs << &rhs)));
-    name.clear();
+    #[cfg(not(feature = "hpu"))]
+    {
+        write!(name, "shl({type_name}, {type_name})").unwrap();
+        bench_group.bench_function(&name, |b| {
+            b.iter(|| {
+                let res = &lhs << &rhs;
+                res.wait();
+                black_box(res)
+            })
+        });
+        name.clear();

-    write!(name, "shr({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs >> &rhs)));
-    name.clear();
+        write!(name, "shr({type_name}, {type_name})").unwrap();
+        bench_group.bench_function(&name, |b| {
+            b.iter(|| {
+                let res = &lhs >> &rhs;
+                res.wait();
+                black_box(res)
+            })
+        });
+        name.clear();

-    write!(name, "rotl({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_left(&rhs))));
-    name.clear();
+        write!(name, "rotl({type_name}, {type_name})").unwrap();
+        bench_group.bench_function(&name, |b| {
+            b.iter(|| {
+                let res = (&lhs).rotate_left(&rhs);
+                res.wait();
+                black_box(res)
+            })
+        });
+        name.clear();

-    write!(name, "rotr({type_name}, {type_name})").unwrap();
-    bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_right(&rhs))));
-    name.clear();
+        write!(name, "rotr({type_name}, {type_name})").unwrap();
+        bench_group.bench_function(&name, |b| {
+            b.iter(|| {
+                let res = (&lhs).rotate_right(&rhs);
+                res.wait();
+                black_box(res)
+            })
+        });
+        name.clear();
+    }
 }

 macro_rules! bench_type {
@@ -108,13 +187,39 @@ bench_type!(FheUint64);
 bench_type!(FheUint128);

 fn main() {
-    let config =
-        ConfigBuilder::with_custom_parameters(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
-            .build();
-    let cks = ClientKey::generate(config);
-    let compressed_sks = CompressedServerKey::new(&cks);
+    #[cfg(feature = "hpu")]
+    let cks = {
+        // Hpu is enabled, start the benchmark on the Hpu hw accelerator
+        use tfhe::tfhe_hpu_backend::prelude::*;
+        use tfhe::{set_server_key, Config};

-    set_server_key(compressed_sks.decompress());
+        // Use environment variable to construct path to configuration file
+        let config_path = ShellString::new(
+            "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(),
+        );
+        let hpu_device = HpuDevice::from_config(&config_path.expand());
+
+        let config = Config::from_hpu_device(&hpu_device);
+        let cks = ClientKey::generate(config);
+        let compressed_sks = CompressedServerKey::new(&cks);
+
+        set_server_key((hpu_device, compressed_sks));
+        cks
+    };
+    #[cfg(not(feature = "hpu"))]
+    let cks = {
+        use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        use tfhe::{set_server_key, ConfigBuilder};
+        let config = ConfigBuilder::with_custom_parameters(
+            BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+        )
+        .build();
+        let cks = ClientKey::generate(config);
+        let compressed_sks = CompressedServerKey::new(&cks);
+
+        set_server_key(compressed_sks.decompress());
+        cks
+    };

    let mut c = Criterion::default().configure_from_args();

--- a/tfhe-benchmark/benches/high_level_api/erc20.rs
+++ b/tfhe-benchmark/benches/high_level_api/erc20.rs
@@ -1,21 +1,22 @@
 #[cfg(feature = "gpu")]
-use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-#[cfg(not(feature = "gpu"))]
-use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-#[cfg(feature = "gpu")]
 use benchmark::utilities::configure_gpu;
 use benchmark::utilities::{write_to_json, OperatorType};
 use criterion::measurement::WallTime;
 use criterion::{BenchmarkGroup, Criterion, Throughput};
 use rand::prelude::*;
 use rand::thread_rng;
+#[cfg(not(feature = "hpu"))]
 use rayon::prelude::*;
-use std::ops::{Add, Mul, Sub};
+#[cfg(not(feature = "hpu"))]
+use std::ops::Mul;
+use std::ops::{Add, Sub};
+#[cfg(feature = "gpu")]
+use tfhe::core_crypto::gpu::get_number_of_gpus;
 use tfhe::keycache::NamedParam;
 use tfhe::prelude::*;
 #[cfg(feature = "gpu")]
 use tfhe::GpuIndex;
-use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBool, FheUint64};
+use tfhe::{set_server_key, ClientKey, CompressedServerKey, FheBool, FheUint64};

 /// Transfer as written in the original FHEvm white-paper,
 /// it uses a comparison to check if the sender has enough,
@@ -25,6 +26,28 @@ pub fn transfer_whitepaper<FheType>(
    to_amount: &FheType,
    amount: &FheType,
 ) -> (FheType, FheType)
+where
+    FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
+    FheBool: IfThenElse<FheType>,
+    for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
+{
+    let has_enough_funds = (from_amount).ge(amount);
+
+    let mut new_to_amount = to_amount + amount;
+    new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount);
+
+    let mut new_from_amount = from_amount - amount;
+    new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount);
+
+    (new_from_amount, new_to_amount)
+}
+
+/// Parallel variant of [`transfer_whitepaper`].
+pub fn par_transfer_whitepaper<FheType>(
+    from_amount: &FheType,
+    to_amount: &FheType,
+    amount: &FheType,
+) -> (FheType, FheType)
 where
    FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Send + Sync,
    FheBool: IfThenElse<FheType>,
@@ -48,6 +71,7 @@ where

 /// This one also uses a comparison, but it leverages the 'boolean' multiplication
 /// instead of cmuxes, so it is faster
+#[cfg(not(feature = "hpu"))]
 fn transfer_no_cmux<FheType>(
    from_amount: &FheType,
    to_amount: &FheType,
@@ -71,6 +95,7 @@ where

 /// This one uses overflowing sub to remove the need for comparison
 /// it also uses the 'boolean' multiplication
+#[cfg(not(feature = "hpu"))]
 fn transfer_overflow<FheType>(
    from_amount: &FheType,
    to_amount: &FheType,
@@ -97,6 +122,7 @@ where

 /// This ones uses both overflowing_add/sub to check that both
 /// the sender has enough funds, and the receiver will not overflow its balance
+#[cfg(not(feature = "hpu"))]
 fn transfer_safe<FheType>(
    from_amount: &FheType,
    to_amount: &FheType,
@@ -123,7 +149,30 @@ where
    (new_from_amount, new_to_amount)
 }

-#[cfg(feature = "pbs-stats")]
+#[cfg(feature = "hpu")]
+/// This one uses a dedicated IOp inside the Hpu
+fn transfer_hpu<FheType>(
+    from_amount: &FheType,
+    to_amount: &FheType,
+    amount: &FheType,
+) -> (FheType, FheType)
+where
+    FheType: FheHpu,
+{
+    use tfhe::tfhe_hpu_backend::prelude::hpu_asm;
+    let src = HpuHandle {
+        native: vec![from_amount, to_amount, amount],
+        boolean: vec![],
+        imm: vec![],
+    };
+    let mut res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_20, src);
+    // Iop erc_20 returns [new_from, new_to]; pop() yields them in reverse order
+    let new_to = res_handle.native.pop().unwrap();
+    let new_from = res_handle.native.pop().unwrap();
+    (new_from, new_to)
+}
+
+#[cfg(all(feature = "pbs-stats", not(feature = "hpu")))]
 mod pbs_stats {
    use super::*;
    use std::fs::{File, OpenOptions};
@@ -200,6 +249,7 @@ fn bench_transfer_latency<FheType, F>(
    transfer_func: F,
 ) where
    FheType: FheEncrypt<u64, ClientKey>,
+    FheType: FheWait,
    F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType),
 {
    #[cfg(feature = "gpu")]
@@ -214,7 +264,11 @@ fn bench_transfer_latency<FheType, F>(
        let amount = FheType::encrypt(rng.gen::<u64>(), client_key);

        b.iter(|| {
-            let (_, _) = transfer_func(&from_amount, &to_amount, &amount);
+            let (new_from, new_to) = transfer_func(&from_amount, &to_amount, &amount);
+            new_from.wait();
+            criterion::black_box(new_from);
+            new_to.wait();
+            criterion::black_box(new_to);
        })
    });

@@ -231,7 +285,7 @@ fn bench_transfer_latency<FheType, F>(
    );
 }

-#[cfg(not(feature = "gpu"))]
+#[cfg(not(any(feature = "gpu", feature = "hpu")))]
 fn bench_transfer_throughput<FheType, F>(
    group: &mut BenchmarkGroup<'_, WallTime>,
    client_key: &ClientKey,
@@ -283,6 +337,7 @@ fn bench_transfer_throughput<FheType, F>(
        );
    }
 }
+
 #[cfg(feature = "gpu")]
 fn cuda_bench_transfer_throughput<FheType, F>(
    group: &mut BenchmarkGroup<'_, WallTime>,
@@ -370,16 +425,75 @@ fn cuda_bench_transfer_throughput<FheType, F>(
    }
 }

-#[cfg(feature = "pbs-stats")]
-use pbs_stats::print_transfer_pbs_counts;
-#[cfg(feature = "gpu")]
-use tfhe::core_crypto::gpu::get_number_of_gpus;
+#[cfg(feature = "hpu")]
+fn hpu_bench_transfer_throughput<FheType, F>(
+    group: &mut BenchmarkGroup<'_, WallTime>,
+    client_key: &ClientKey,
+    bench_name: &str,
+    type_name: &str,
+    fn_name: &str,
+    transfer_func: F,
+) where
+    FheType: FheEncrypt<u64, ClientKey> + Send + Sync,
+    FheType: FheWait,
+    F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType) + Sync,
+{
+    let mut rng = thread_rng();

-#[cfg(not(feature = "gpu"))]
+    for num_elems in [10, 100] {
+        group.throughput(Throughput::Elements(num_elems));
+        let bench_id =
+            format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems");
+        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
+            let from_amounts = (0..num_elems)
+                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
+                .collect::<Vec<_>>();
+            let to_amounts = (0..num_elems)
+                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
+                .collect::<Vec<_>>();
+            let amounts = (0..num_elems)
+                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
+                .collect::<Vec<_>>();
+
+            b.iter(|| {
+                let (last_new_from, last_new_to) = std::iter::zip(
+                    from_amounts.iter(),
+                    std::iter::zip(to_amounts.iter(), amounts.iter()),
+                )
+                .map(|(from_amount, (to_amount, amount))| {
+                    transfer_func(from_amount, to_amount, amount)
+                })
+                .last()
+                .unwrap();
+
+                // Wait on the last result to ensure all computations are over
+                last_new_from.wait();
+                criterion::black_box(last_new_from);
+                last_new_to.wait();
+                criterion::black_box(last_new_to);
+            });
+        });
+
+        let params = client_key.computation_parameters();
+
+        write_to_json::<u64, _>(
+            &bench_id,
+            params,
+            params.name(),
+            "erc20-transfer",
+            &OperatorType::Atomic,
+            64,
+            vec![],
+        );
+    }
+}
+
+#[cfg(not(any(feature = "gpu", feature = "hpu")))]
 fn main() {
-    let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+    use crate::pbs_stats::print_transfer_pbs_counts;
+    let params = benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

-    let config = ConfigBuilder::with_custom_parameters(params).build();
+    let config = tfhe::ConfigBuilder::with_custom_parameters(params).build();
    let cks = ClientKey::generate(config);
    let compressed_sks = CompressedServerKey::new(&cks);

@@ -401,7 +515,7 @@ fn main() {
            &cks,
            "FheUint64",
            "transfer::whitepaper",
-            transfer_whitepaper::<FheUint64>,
+            par_transfer_whitepaper::<FheUint64>,
        );
        print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
        print_transfer_pbs_counts(
@@ -422,7 +536,7 @@ fn main() {
            bench_name,
            "FheUint64",
            "transfer::whitepaper",
-            transfer_whitepaper::<FheUint64>,
+            par_transfer_whitepaper::<FheUint64>,
        );
        bench_transfer_latency(
            &mut group,
@@ -461,7 +575,7 @@ fn main() {
            bench_name,
            "FheUint64",
            "transfer::whitepaper",
-            transfer_whitepaper::<FheUint64>,
+            par_transfer_whitepaper::<FheUint64>,
        );
        bench_transfer_throughput(
            &mut group,
@@ -496,9 +610,10 @@ fn main() {

 #[cfg(feature = "gpu")]
 fn main() {
-    let params = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+    use crate::pbs_stats::print_transfer_pbs_counts;
+    let params = benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

-    let config = ConfigBuilder::with_custom_parameters(params).build();
+    let config = tfhe::ConfigBuilder::with_custom_parameters(params).build();
    let cks = ClientKey::generate(config);

    let mut c = Criterion::default().sample_size(10).configure_from_args();
@@ -514,7 +629,7 @@ fn main() {
            &cks,
            "FheUint64",
            "transfer::whitepaper",
-            transfer_whitepaper::<FheUint64>,
+            par_transfer_whitepaper::<FheUint64>,
        );
        print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
        print_transfer_pbs_counts(
@@ -535,7 +650,7 @@ fn main() {
            bench_name,
            "FheUint64",
            "transfer::whitepaper",
-            transfer_whitepaper::<FheUint64>,
+            par_transfer_whitepaper::<FheUint64>,
        );
        bench_transfer_latency(
            &mut group,
@@ -574,7 +689,7 @@ fn main() {
            bench_name,
            "FheUint64",
            "transfer::whitepaper",
-            transfer_whitepaper::<FheUint64>,
+            par_transfer_whitepaper::<FheUint64>,
        );
        cuda_bench_transfer_throughput(
            &mut group,
@@ -605,3 +720,76 @@ fn main() {

    c.final_summary();
 }
+#[cfg(feature = "hpu")]
+fn main() {
+    let cks = {
+        // Hpu is enabled, start the benchmark on the Hpu hw accelerator
+        use tfhe::tfhe_hpu_backend::prelude::*;
+        use tfhe::Config;
+
+        // Use environment variable to construct path to configuration file
+        let config_path = ShellString::new(
+            "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(),
+        );
+        let hpu_device = HpuDevice::from_config(&config_path.expand());
+
+        let config = Config::from_hpu_device(&hpu_device);
+        let cks = ClientKey::generate(config);
+        let compressed_sks = CompressedServerKey::new(&cks);
+
+        set_server_key((hpu_device, compressed_sks));
+        cks
+    };
+
+    let mut c = Criterion::default().sample_size(10).configure_from_args();
+
+    let bench_name = "hlapi::hpu::erc20::transfer";
+
+    // FheUint64 latency
+    {
+        let mut group = c.benchmark_group(bench_name);
+        bench_transfer_latency(
+            &mut group,
+            &cks,
+            bench_name,
+            "FheUint64",
+            "whitepaper",
+            transfer_whitepaper::<FheUint64>,
+        );
+        // The Erc20-optimized instruction is only available on Hpu
+        bench_transfer_latency(
+            &mut group,
+            &cks,
+            bench_name,
+            "FheUint64",
+            "hpu_optim",
+            transfer_hpu::<FheUint64>,
+        );
+        group.finish();
+    }
+
+    // FheUint64 Throughput
+    {
+        let mut group = c.benchmark_group(bench_name);
+        hpu_bench_transfer_throughput(
+            &mut group,
+            &cks,
+            bench_name,
+            "FheUint64",
+            "whitepaper",
+            transfer_whitepaper::<FheUint64>,
+        );
+        // The Erc20-optimized instruction is only available on Hpu
+        hpu_bench_transfer_throughput(
+            &mut group,
+            &cks,
+            bench_name,
+            "FheUint64",
+            "hpu_optim",
+            transfer_hpu::<FheUint64>,
+        );
+        group.finish();
+    }
+
+    c.final_summary();
+}
--- a/tfhe-benchmark/benches/integer/bench.rs
+++ b/tfhe-benchmark/benches/integer/bench.rs
@@ -2931,6 +2931,323 @@ use cuda::{
    unchecked_cuda_ops, unchecked_scalar_cuda_ops,
 };

+#[cfg(feature = "hpu")]
+mod hpu {
+    use super::*;
+    use criterion::{black_box, criterion_group};
+    use tfhe::integer::hpu::ciphertext::HpuRadixCiphertext;
+    use tfhe::prelude::CastFrom;
+    use tfhe::tfhe_hpu_backend::prelude::*;
+
+    /// Base function to bench an Hpu operation.
+    /// Input/output types and lengths are inferred from the associated IOp prototype
+    fn bench_hpu_iop_clean_inputs(
+        c: &mut Criterion,
+        bench_name: &str,
+        display_name: &str,
+        iop: &hpu_asm::AsmIOpcode,
+    ) {
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(60));
+        let mut rng = rand::thread_rng();
+
+        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
+            if bit_size > ScalarType::BITS as usize {
+                break;
+            }
+            let param_name = param.name();
+
+            let max_value_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size);
+
+            let bench_id;
+
+            let proto = if let Some(format) = iop.format() {
+                format.proto.clone()
+            } else {
+                panic!("HPU only IOp with defined prototype could be benched");
+            };
+
+            match get_bench_type() {
+                BenchmarkType::Latency => {
+                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    bench_group.bench_function(&bench_id, |b| {
+                        let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let hpu_device_mutex = KEY_CACHE.get_hpu_device(param);
+                        let hpu_device = hpu_device_mutex.lock().unwrap();
+
+                        let gen_inputs = || {
+                            let srcs = proto
+                                .src
+                                .iter()
+                                .map(|mode| {
+                                    let (bw, block) = match mode {
+                                        hpu_asm::iop::VarMode::Native => (bit_size, num_block),
+                                        hpu_asm::iop::VarMode::Half => {
+                                            (bit_size / 2, num_block / 2)
+                                        }
+                                        hpu_asm::iop::VarMode::Bool => (1, 1),
+                                    };
+
+                                    let clear = rng
+                                        .gen_range(0..u128::cast_from(max_value_for_bit_size))
+                                        & if bw < u128::BITS as usize {
+                                            (1_u128 << bw) - 1
+                                        } else {
+                                            !0_u128
+                                        };
+                                    let fhe = cks.encrypt_radix(clear, block);
+                                    HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device)
+                                })
+                                .collect::<Vec<_>>();
+
+                            let imms = (0..proto.imm)
+                                .map(|_| rng.gen_range(0..u128::cast_from(max_value_for_bit_size)))
+                                .collect::<Vec<_>>();
+                            (srcs, imms)
+                        };
+
+                        b.iter_batched(
+                            gen_inputs,
+                            |(srcs, imms)| {
+                                let res =
+                                    HpuRadixCiphertext::exec(&proto, iop.opcode(), &srcs, &imms);
+                                res.into_iter().for_each(|ct| {
+                                    ct.wait();
+                                    black_box(ct);
+                                });
+                            },
+                            criterion::BatchSize::SmallInput,
+                        )
+                    });
+                }
+                BenchmarkType::Throughput => {
+                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    bench_group
+                        .sample_size(10)
+                        .measurement_time(std::time::Duration::from_secs(30));
+                    let elements = throughput_num_threads(num_block, 1);
+                    bench_group.throughput(Throughput::Elements(elements));
+                    bench_group.bench_function(&bench_id, |b| {
+                        let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let hpu_device_mutex = KEY_CACHE.get_hpu_device(param);
+                        let hpu_device = hpu_device_mutex.lock().unwrap();
+
+                        let inputs = (0..elements)
+                            .map(|_| {
+                                let srcs = proto
+                                    .src
+                                    .iter()
+                                    .map(|mode| {
+                                        let (bw, block) = match mode {
+                                            hpu_asm::iop::VarMode::Native => (bit_size, num_block),
+                                            hpu_asm::iop::VarMode::Half => {
+                                                (bit_size / 2, num_block / 2)
+                                            }
+                                            hpu_asm::iop::VarMode::Bool => (1, 1),
+                                        };
+
+                                        let clear = rng
+                                            .gen_range(0..u128::cast_from(max_value_for_bit_size))
+                                            & if bw < u128::BITS as usize {
+                                                (1_u128 << bw) - 1
+                                            } else {
+                                                !0_u128
+                                            };
+                                        let fhe = cks.encrypt_radix(clear, block);
+                                        HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device)
+                                    })
+                                    .collect::<Vec<_>>();
+
+                                let imms = (0..proto.imm)
+                                    .map(|_| {
+                                        rng.gen_range(0..u128::cast_from(max_value_for_bit_size))
+                                    })
+                                    .collect::<Vec<_>>();
+                                (srcs, imms)
+                            })
+                            .collect::<Vec<_>>();
+
+                        b.iter(|| {
+                            let last_res = inputs
+                                .iter()
+                                .map(|input| {
+                                    HpuRadixCiphertext::exec(
+                                        &proto,
+                                        iop.opcode(),
+                                        &input.0,
+                                        &input.1,
+                                    )
+                                })
+                                .next_back()
+                                .unwrap();
+                            last_res.into_iter().for_each(|ct| {
+                                ct.wait();
+                                black_box(ct);
+                            });
+                        })
+                    });
+                }
+            }
+
+            write_to_json::<u64, _>(
+                &bench_id,
+                param,
+                param.name(),
+                display_name,
+                &OperatorType::Atomic,
+                bit_size as u32,
+                vec![param.message_modulus().0.ilog2(); num_block],
+            );
+        }
+
+        bench_group.finish()
+    }
+
+    macro_rules! define_hpu_bench_default_fn (
+    (iop_name: $iop:ident, display_name:$name:ident) => {
+        ::paste::paste!{
+        fn [< default_hpu_ $iop:lower >](c: &mut Criterion) {
+            bench_hpu_iop_clean_inputs(
+                c,
+                concat!("integer::hpu::", stringify!($iop)),
+                stringify!($name),
+                &hpu_asm::iop::[< IOP_ $iop:upper >],
+            )
+        }
+        }
+    }
+    );
+
+    macro_rules! define_hpu_bench_default_fn_scalar (
+    (iop_name: $iop:ident, display_name:$name:ident) => {
+        ::paste::paste!{
+        fn [< default_hpu_ $iop:lower >](c: &mut Criterion) {
+            bench_hpu_iop_clean_inputs(
+                c,
+                concat!("integer::hpu::scalar::", stringify!($iop)),
+                stringify!($name),
+                &hpu_asm::iop::[< IOP_ $iop:upper >],
+            )
+        }
+        }
+    }
+    );
+
+    // Alu ------------------------------------------------------------------------
+    define_hpu_bench_default_fn!(
+        iop_name: add,
+        display_name: add
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: sub,
+        display_name: sub
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: mul,
+        display_name: mul
+    );
+    criterion_group!(
+        default_hpu_ops,
+        default_hpu_add,
+        default_hpu_sub,
+        default_hpu_mul
+    );
+
+    // Alu Scalar -----------------------------------------------------------------
+    define_hpu_bench_default_fn_scalar!(
+        iop_name: adds,
+        display_name: add
+    );
+    define_hpu_bench_default_fn_scalar!(
+        iop_name: subs,
+        display_name: sub
+    );
+    //define_hpu_bench_default_fn!(
+    //    iop_name: ssub,
+    //    display_name: scalar_sub
+    //);
+    define_hpu_bench_default_fn_scalar!(
+        iop_name: muls,
+        display_name: mul
+    );
+    criterion_group!(
+        default_hpu_ops_scalar,
+        default_hpu_adds,
+        default_hpu_subs,
+        //default_hpu_ssub,
+        default_hpu_muls
+    );
+    // Bitwise --------------------------------------------------------------------
+    define_hpu_bench_default_fn!(
+        iop_name: bw_and,
+        display_name: bitand
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: bw_or,
+        display_name: bitor
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: bw_xor,
+        display_name: bitxor
+    );
+    criterion_group!(
+        default_hpu_bitwise,
+        default_hpu_bw_and,
+        default_hpu_bw_or,
+        default_hpu_bw_xor,
+    );
+    // Comparison ----------------------------------------------------------------
+    define_hpu_bench_default_fn!(
+        iop_name: cmp_eq,
+        display_name: equal
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: cmp_neq,
+        display_name: not_equal
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: cmp_gt,
+        display_name: greater_than
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: cmp_gte,
+        display_name: greater_or_equal
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: cmp_lt,
+        display_name: lower_than
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: cmp_lte,
+        display_name: lower_or_equal
+    );
+    criterion_group!(
+        default_hpu_cmp,
+        default_hpu_cmp_eq,
+        default_hpu_cmp_neq,
+        default_hpu_cmp_gt,
+        default_hpu_cmp_gte,
+        default_hpu_cmp_lt,
+        default_hpu_cmp_lte,
+    );
+    // Ternary --------------------------------------------------------------------
+    define_hpu_bench_default_fn!(
+        iop_name: if_then_else,
+        display_name: if_then_else
+    );
+    define_hpu_bench_default_fn!(
+        iop_name: if_then_zero,
+        display_name: if_then_zero
+    );
+    criterion_group!(
+        default_hpu_select,
+        default_hpu_if_then_else,
+        default_hpu_if_then_zero,
+    );
+}
+
 criterion_group!(
    smart_ops,
    smart_neg,
@@ -3297,6 +3614,23 @@ fn go_through_gpu_bench_groups(val: &str) {
    };
 }

+#[cfg(feature = "hpu")] // compiled only when the `hpu` feature is enabled
+fn go_through_hpu_bench_groups(val: &str) { // runs the HPU bench groups for flavor `val`
+    match val.to_lowercase().as_str() { // flavor matching is case-insensitive
+        "default" => { // runs each of the five groups called below
+            hpu::default_hpu_ops();
+            hpu::default_hpu_ops_scalar();
+            hpu::default_hpu_bitwise();
+            hpu::default_hpu_cmp();
+            hpu::default_hpu_select();
+        }
+        "fast_default" => { // reduced run: `default_hpu_ops` only
+            hpu::default_hpu_ops();
+        }
+        _ => panic!("unknown benchmark operations flavor"), // any other flavor aborts
+    };
+}
+
 fn go_through_cpu_bench_groups(val: &str) {
    match val.to_lowercase().as_str() {
        "default" => {
@@ -3336,7 +3670,9 @@ fn main() {
        Ok(val) => {
            #[cfg(feature = "gpu")]
            go_through_gpu_bench_groups(&val);
-            #[cfg(not(feature = "gpu"))]
+            #[cfg(feature = "hpu")]
+            go_through_hpu_bench_groups(&val);
+            #[cfg(not(any(feature = "gpu", feature = "hpu")))]
            go_through_cpu_bench_groups(&val);
        }
        Err(_) => {
--- a/tfhe-benchmark/src/params.rs
+++ b/tfhe-benchmark/src/params.rs
@@ -33,7 +33,8 @@ pub mod shortint_params {
    use tfhe::core_crypto::prelude::{DynamicDistribution, LweBskGroupingFactor};
    use tfhe::keycache::NamedParam;
    use tfhe::shortint::{
-        CarryModulus, ClassicPBSParameters, MessageModulus, MultiBitPBSParameters, PBSParameters,
+        AtomicPatternParameters, CarryModulus, ClassicPBSParameters, MessageModulus,
+        MultiBitPBSParameters,
    };

    pub const SHORTINT_BENCH_PARAMS_TUNIFORM: [ClassicPBSParameters; 4] = [
@@ -78,7 +79,7 @@ pub mod shortint_params {
                .map(|params| {
                    (
                        params.name(),
-                        <ClassicPBSParameters as Into<PBSParameters>>::into(*params)
+                        <ClassicPBSParameters as Into<AtomicPatternParameters>>::into(*params)
                            .to_owned()
                            .into(),
                    )
@@ -94,7 +95,7 @@ pub mod shortint_params {
                .map(|(params, name)| {
                    (
                        name.to_string(),
-                        <ClassicPBSParameters as Into<PBSParameters>>::into(*params)
+                        <ClassicPBSParameters as Into<AtomicPatternParameters>>::into(*params)
                            .to_owned()
                            .into(),
                    )
@@ -111,7 +112,7 @@ pub mod shortint_params {
                .map(|params| {
                    (
                        params.name(),
-                        <MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
+                        <MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
                            .to_owned()
                            .into(),
                    )
@@ -132,7 +133,7 @@ pub mod shortint_params {
                .map(|(params, name)| {
                    (
                        name.to_string(),
-                        <MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
+                        <MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
                            .to_owned()
                            .into(),
                    )
@@ -150,7 +151,7 @@ pub mod shortint_params {
                .map(|params| {
                    (
                        params.name(),
-                        <MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
+                        <MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
                            .to_owned()
                            .into(),
                        params.grouping_factor,
@@ -172,7 +173,7 @@ pub mod shortint_params {
                .map(|(params, name)| {
                    (
                        name.to_string(),
-                        <MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
+                        <MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
                            .to_owned()
                            .into(),
                        params.grouping_factor,
@@ -183,7 +184,7 @@ pub mod shortint_params {
        }
    }

-    pub fn raw_benchmark_parameters() -> Vec<PBSParameters> {
+    pub fn raw_benchmark_parameters() -> Vec<AtomicPatternParameters> {
        let is_multi_bit = match env::var("__TFHE_RS_PARAM_TYPE") {
            Ok(val) => val.to_lowercase() == "multi_bit",
            Err(_) => false,
@@ -351,7 +352,7 @@ pub mod shortint_params {
        }
    }

-    pub fn filter_parameters<'a, P: Copy + Into<PBSParameters>>(
+    pub fn filter_parameters<'a, P: Copy + Into<AtomicPatternParameters>>(
        params: &[(&'a P, &'a str)],
        desired_noise_distribution: DesiredNoiseDistribution,
        desired_backend: DesiredBackend,
@@ -359,7 +360,7 @@ pub mod shortint_params {
        params
            .iter()
            .filter_map(|(p, name)| {
-                let temp_param: PBSParameters = (**p).into();
+                let temp_param: AtomicPatternParameters = (**p).into();

                match (
                    temp_param.lwe_noise_distribution(),
@@ -391,13 +392,14 @@ mod integer_params {
    use crate::utilities::EnvConfig;
    use itertools::iproduct;
    use std::vec::IntoIter;
-    use tfhe::shortint::PBSParameters;
+    use tfhe::shortint::AtomicPatternParameters;

    /// An iterator that yields a succession of combinations
    /// of parameters and a num_block to achieve a certain bit_size ciphertext
    /// in radix decomposition
    pub struct ParamsAndNumBlocksIter {
-        params_and_bit_sizes: itertools::Product<IntoIter<PBSParameters>, IntoIter<usize>>,
+        params_and_bit_sizes:
+            itertools::Product<IntoIter<AtomicPatternParameters>, IntoIter<usize>>,
    }

    impl Default for ParamsAndNumBlocksIter {
@@ -405,23 +407,33 @@ mod integer_params {
            let env_config = EnvConfig::new();

            if env_config.is_multi_bit {
-                #[cfg(feature = "gpu")]
-                let params = vec![
-                    BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
-                        .into(),
-                ];
-                #[cfg(not(feature = "gpu"))]
-                let params = vec![
-                    BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
-                ];
+                #[cfg(feature = "hpu")]
+                panic!("Hpu doesn't implement MultiBit");

-                let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
-                Self {
-                    params_and_bit_sizes,
+                #[cfg(not(feature = "hpu"))]
+                {
+                    #[cfg(feature = "gpu")]
+                    let params = vec![
+                        BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
+                            .into(),
+                    ];
+                    #[cfg(not(feature = "gpu"))]
+                    let params = vec![
+                        BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128
+                            .into(),
+                    ];
+
+                    let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
+                    Self {
+                        params_and_bit_sizes,
+                    }
                }
            } else {
                // FIXME One set of parameter is tested since we want to benchmark only quickest
                // operations.
+                #[cfg(feature = "hpu")]
+                let params = vec![BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64.into()];
+                #[cfg(not(feature = "hpu"))]
                let params = vec![BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into()];

                let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
@@ -433,7 +445,7 @@ mod integer_params {
    }

    impl Iterator for ParamsAndNumBlocksIter {
-        type Item = (PBSParameters, usize, usize);
+        type Item = (AtomicPatternParameters, usize, usize);

        fn next(&mut self) -> Option<Self::Item> {
            let (param, bit_size) = self.params_and_bit_sizes.next()?;
--- a/tfhe-benchmark/src/params_aliases.rs
+++ b/tfhe-benchmark/src/params_aliases.rs
@@ -1,6 +1,8 @@
 #[cfg(any(feature = "shortint", feature = "integer"))]
 pub mod shortint_params_aliases {
    use tfhe::shortint::parameters::current_params::*;
+    #[cfg(feature = "hpu")]
+    use tfhe::shortint::parameters::KeySwitch32PBSParameters;
    use tfhe::shortint::parameters::{
        ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
        MultiBitPBSParameters, NoiseSquashingParameters, ShortintKeySwitchingParameters,
@@ -136,6 +138,15 @@ pub mod shortint_params_aliases {
    pub const BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
        NoiseSquashingParameters =
        V1_2_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
+    #[cfg(feature = "hpu")]
+    // KS32 PBS Gaussian for Hpu (bench alias of the V1_2 parameter set)
+    pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters =
+        V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64;
+    #[cfg(feature = "hpu")]
+    // KS32 PBS TUniform for Hpu (bench alias of the V1_2 parameter set)
+    pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64: KeySwitch32PBSParameters =
+        V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64;
 }

 #[cfg(any(feature = "shortint", feature = "integer"))]
--- a/tfhe-benchmark/src/utilities.rs
+++ b/tfhe-benchmark/src/utilities.rs
@@ -312,6 +312,7 @@ pub fn write_to_json<

 const FAST_BENCH_BIT_SIZES: [usize; 1] = [64];
 const BENCH_BIT_SIZES: [usize; 8] = [4, 8, 16, 32, 40, 64, 128, 256];
+const HPU_BENCH_BIT_SIZES: [usize; 5] = [8, 16, 32, 64, 128]; // used under the `hpu` feature
 const MULTI_BIT_CPU_SIZES: [usize; 6] = [4, 8, 16, 32, 40, 64];

 /// User configuration in which benchmarks must be run.
@@ -349,6 +350,8 @@ impl EnvConfig {
            } else {
                MULTI_BIT_CPU_SIZES.to_vec()
            }
+        } else if cfg!(feature = "hpu") {
+            HPU_BENCH_BIT_SIZES.to_vec()
        } else {
            BENCH_BIT_SIZES.to_vec()
        }
@@ -397,7 +400,15 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
        elements.min(1500) // This threshold is useful for operation with both a small number of
                           // block and low PBs count.
    }
-    #[cfg(not(feature = "gpu"))]
+    #[cfg(feature = "hpu")]
+    {
+        // NB: unused with HPU
+        let _ = minimum_loading;
+        let _ = op_pbs_count;
+        // Intent: send at least 64 IOp. NOTE(review): `min(64.0)` caps at 64 instead -- confirm
+        block_multiplicator.min(64.0) as u64
+    }
+    #[cfg(not(any(feature = "gpu", feature = "hpu")))]
    {
        let num_threads = rayon::current_num_threads() as f64;
        let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);