feat(hpu,mockup): Add a `nops` flag for fast simulation

Bypass TFHE operations for fast simulation.
This obviously breaks the behavior but keeps accurate performance estimation.

For correct behavior with a fast runtime, use the `fast` parameter set instead.
NB: this keeps correct behavior but breaks performance estimation.

Not a perfect solution, but it should mitigate our runtime issue until proper
computation over trivial ciphertexts is supported.
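
In other words, the mockup keeps the invariant path (SYNC, ack and trace handling) and only skips the compute arms. A minimal, self-contained sketch of that dispatch shape (simplified, hypothetical types; the real match lives in the HpuSim DOp loop in the diff below):

#[allow(dead_code)]
enum DOp { Sync, Ld, Add /* ... */ }

struct Options { nops: bool }

fn exec(options: &Options, dop: &DOp) {
    match dop {
        // Invariant path: SYNC is always handled so IOp acknowledgement and
        // trace/report generation keep working, preserving the timing model.
        DOp::Sync => { /* push ack, emit trace */ }
        // Everything else is skipped when `--nops` is set: performance
        // estimation stays accurate, but register contents become meaningless.
        _ if options.nops => {}
        _ => { /* execute the TFHE operation on the CPU model */ }
    }
}

fn main() {
    let opts = Options { nops: true };
    exec(&opts, &DOp::Add);  // bypassed
    exec(&opts, &DOp::Sync); // still handled
}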
Author: Baptiste Roux
Date:   2025-03-18 09:15:45 +01:00
Parent: ed6f74b468
Commit: ce3208f74c

3 changed files with 240 additions and 186 deletions


@@ -328,191 +328,7 @@ impl HpuSim {
// Read operands
match dop {
hpu_asm::DOp::LD(op_impl) => {
let dst = &mut self.regfile[op_impl.0.rid.0 as usize];
let cid_ofst = match op_impl.0.slot {
hpu_asm::MemId::Addr(ct_id) => ct_id.0 as usize,
_ => panic!("Template must have been resolved before execution"),
};
// ct_ofst is identical across all PCs
let ct_ofst = cid_ofst
* page_align(
hpu_big_lwe_ciphertext_size(&self.params.rtl_params)
.div_ceil(self.params.rtl_params.pc_params.pem_pc)
* std::mem::size_of::<u64>(),
);
let ct_chunk = self
.config
.board
.ct_pc
.iter()
.enumerate()
.map(|(id, mem_kind)| {
let ldst_ofst = {
let (msb, lsb) = self.regmap.addr_offset().ldst[id];
((msb as u64) << 32) + lsb as u64
};
match mem_kind {
MemKind::Ddr { .. } => {
self.ddr.get_chunk(ldst_ofst + ct_ofst as u64).data()
}
MemKind::Hbm { pc } => self.hbm_bank[*pc]
.get_chunk(ldst_ofst + ct_ofst as u64)
.data(),
}
// self.hbm_bank[*pc].get_chunk(ldst_ofst + ct_ofst as u64)
})
.collect::<Vec<_>>();
let hw_slice = dst.as_mut_view().into_container();
std::iter::zip(hw_slice, ct_chunk).for_each(|(hpu, mem)| {
// NB: Chunks are extended to enforce a page-aligned buffer
// -> To prevent errors during copy, we shrink the mem buffer to the
// real size beforehand
let size_b = std::mem::size_of_val(hpu);
let hbm_u64 = bytemuck::cast_slice::<u8, u64>(&mem[0..size_b]);
hpu.clone_from_slice(hbm_u64);
});
}
hpu_asm::DOp::ST(op_impl) => {
let src = &self.regfile[op_impl.0.rid.0 as usize];
let cid_ofst = match op_impl.0.slot {
hpu_asm::MemId::Addr(ct_id) => ct_id.0 as usize,
_ => panic!("Template must have been resolved before execution"),
};
// ct_ofst is identical across all PCs
let ct_ofst = cid_ofst
* page_align(
hpu_big_lwe_ciphertext_size(&self.params.rtl_params)
.div_ceil(self.params.rtl_params.pc_params.pem_pc)
* std::mem::size_of::<u64>(),
);
src.as_view()
.into_container()
.into_iter()
.enumerate()
.for_each(|(id, hpu)| {
let ldst_ofst = {
let (msb, lsb) = self.regmap.addr_offset().ldst[id];
((msb as u64) << 32) + lsb as u64
};
let ct_chunk_mut_view = match self.config.board.ct_pc[id] {
MemKind::Ddr { .. } => self
.ddr
.get_mut_chunk(ldst_ofst + ct_ofst as u64)
.data_mut(),
MemKind::Hbm { pc } => self.hbm_bank[pc]
.get_mut_chunk(ldst_ofst + ct_ofst as u64)
.data_mut(),
};
// NB: HBM chunks are extended to enforce a page-aligned buffer
// -> Shrink them to the slice size to prevent errors during copy
let size_b = std::mem::size_of_val(hpu);
let ct_chunk_u64 =
bytemuck::cast_slice_mut::<u8, u64>(&mut ct_chunk_mut_view[0..size_b]);
ct_chunk_u64.copy_from_slice(hpu);
});
}
hpu_asm::DOp::ADD(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid);
let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid);
lwe_ciphertext_add_assign(&mut cpu_s0, &cpu_s1);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::SUB(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid);
let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid);
lwe_ciphertext_sub_assign(&mut cpu_s0, &cpu_s1);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::MAC(op_impl) => {
// NB: Srcs are used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid);
let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid);
lwe_ciphertext_cleartext_mul_assign(
&mut cpu_s0,
Cleartext(op_impl.0.mul_factor.0 as u64),
);
lwe_ciphertext_add_assign(&mut cpu_s0, &cpu_s1);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::ADDS(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta();
lwe_ciphertext_plaintext_add_assign(&mut cpu_s0, Plaintext(msg_encoded));
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::SUBS(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta();
lwe_ciphertext_plaintext_sub_assign(&mut cpu_s0, Plaintext(msg_encoded));
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::SSUB(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
lwe_ciphertext_opposite_assign(&mut cpu_s0);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta();
lwe_ciphertext_plaintext_add_assign(&mut cpu_s0, Plaintext(msg_encoded));
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::MULS(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
lwe_ciphertext_cleartext_mul_assign(&mut cpu_s0, Cleartext(msg_cst));
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::PBS(op_impl) => {
self.apply_pbs2reg(1, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_ML2(op_impl) => {
self.apply_pbs2reg(2, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_ML4(op_impl) => {
self.apply_pbs2reg(4, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_ML8(op_impl) => {
self.apply_pbs2reg(8, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_F(op_impl) => {
self.apply_pbs2reg(1, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_ML2_F(op_impl) => {
self.apply_pbs2reg(2, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_ML4_F(op_impl) => {
self.apply_pbs2reg(4, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
hpu_asm::DOp::PBS_ML8_F(op_impl) => {
self.apply_pbs2reg(8, op_impl.0.dst_rid, op_impl.0.src_rid, op_impl.0.gid)
}
// Invariant path, handled independently of the `nops` flag
hpu_asm::DOp::SYNC(_) => {
// Push ack in stream
let iop = self
@@ -564,10 +380,239 @@ impl HpuSim {
writeln!(trace_file, "{}", json_string).unwrap();
}
}
// Skipped when the `nops` flag is set
_ => {
if !self.options.nops {
match dop {
hpu_asm::DOp::LD(op_impl) => {
let dst = &mut self.regfile[op_impl.0.rid.0 as usize];
let cid_ofst = match op_impl.0.slot {
hpu_asm::MemId::Addr(ct_id) => ct_id.0 as usize,
_ => panic!("Template must have been resolved before execution"),
};
// ct_ofst is identical across all PCs
let ct_ofst = cid_ofst
* page_align(
hpu_big_lwe_ciphertext_size(&self.params.rtl_params)
.div_ceil(self.params.rtl_params.pc_params.pem_pc)
* std::mem::size_of::<u64>(),
);
let ct_chunk = self
.config
.board
.ct_pc
.iter()
.enumerate()
.map(|(id, mem_kind)| {
let ldst_ofst = {
let (msb, lsb) = self.regmap.addr_offset().ldst[id];
((msb as u64) << 32) + lsb as u64
};
match mem_kind {
MemKind::Ddr { .. } => {
self.ddr.get_chunk(ldst_ofst + ct_ofst as u64).data()
}
MemKind::Hbm { pc } => self.hbm_bank[*pc]
.get_chunk(ldst_ofst + ct_ofst as u64)
.data(),
}
// self.hbm_bank[*pc].get_chunk(ldst_ofst + ct_ofst as u64)
})
.collect::<Vec<_>>();
let hw_slice = dst.as_mut_view().into_container();
std::iter::zip(hw_slice, ct_chunk).for_each(|(hpu, mem)| {
// NB: Chunks are extended to enforce a page-aligned buffer
// -> To prevent errors during copy, we shrink the mem buffer to the
// real size beforehand
let size_b = std::mem::size_of_val(hpu);
let hbm_u64 = bytemuck::cast_slice::<u8, u64>(&mem[0..size_b]);
hpu.clone_from_slice(hbm_u64);
});
}
hpu_asm::DOp::ST(op_impl) => {
let src = &self.regfile[op_impl.0.rid.0 as usize];
let cid_ofst = match op_impl.0.slot {
hpu_asm::MemId::Addr(ct_id) => ct_id.0 as usize,
_ => panic!("Template must have been resolved before execution"),
};
// ct_ofst is identical across all PCs
let ct_ofst = cid_ofst
* page_align(
hpu_big_lwe_ciphertext_size(&self.params.rtl_params)
.div_ceil(self.params.rtl_params.pc_params.pem_pc)
* std::mem::size_of::<u64>(),
);
src.as_view()
.into_container()
.into_iter()
.enumerate()
.for_each(|(id, hpu)| {
let ldst_ofst = {
let (msb, lsb) = self.regmap.addr_offset().ldst[id];
((msb as u64) << 32) + lsb as u64
};
let ct_chunk_mut_view = match self.config.board.ct_pc[id] {
MemKind::Ddr { .. } => self
.ddr
.get_mut_chunk(ldst_ofst + ct_ofst as u64)
.data_mut(),
MemKind::Hbm { pc } => self.hbm_bank[pc]
.get_mut_chunk(ldst_ofst + ct_ofst as u64)
.data_mut(),
};
// NB: HBM chunks are extended to enforce a page-aligned buffer
// -> Shrink them to the slice size to prevent errors during copy
let size_b = std::mem::size_of_val(hpu);
let ct_chunk_u64 = bytemuck::cast_slice_mut::<u8, u64>(
&mut ct_chunk_mut_view[0..size_b],
);
ct_chunk_u64.copy_from_slice(hpu);
});
}
hpu_asm::DOp::ADD(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid);
let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid);
lwe_ciphertext_add_assign(&mut cpu_s0, &cpu_s1);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::SUB(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid);
let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid);
lwe_ciphertext_sub_assign(&mut cpu_s0, &cpu_s1);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::MAC(op_impl) => {
// NB: Srcs are used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid);
let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid);
lwe_ciphertext_cleartext_mul_assign(
&mut cpu_s0,
Cleartext(op_impl.0.mul_factor.0 as u64),
);
lwe_ciphertext_add_assign(&mut cpu_s0, &cpu_s1);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::ADDS(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta();
lwe_ciphertext_plaintext_add_assign(
&mut cpu_s0,
Plaintext(msg_encoded),
);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::SUBS(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta();
lwe_ciphertext_plaintext_sub_assign(
&mut cpu_s0,
Plaintext(msg_encoded),
);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::SSUB(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
lwe_ciphertext_opposite_assign(&mut cpu_s0);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta();
lwe_ciphertext_plaintext_add_assign(
&mut cpu_s0,
Plaintext(msg_encoded),
);
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::MULS(op_impl) => {
// NB: The first src is used as destination to prevent useless allocation
let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid);
let msg_cst = match op_impl.0.msg_cst {
hpu_asm::ImmId::Cst(cst) => cst as u64,
_ => panic!("Template must have been resolved before execution"),
};
lwe_ciphertext_cleartext_mul_assign(&mut cpu_s0, Cleartext(msg_cst));
self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view());
}
hpu_asm::DOp::PBS(op_impl) => self.apply_pbs2reg(
1,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_ML2(op_impl) => self.apply_pbs2reg(
2,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_ML4(op_impl) => self.apply_pbs2reg(
4,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_ML8(op_impl) => self.apply_pbs2reg(
8,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_F(op_impl) => self.apply_pbs2reg(
1,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_ML2_F(op_impl) => self.apply_pbs2reg(
2,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_ML4_F(op_impl) => self.apply_pbs2reg(
4,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
hpu_asm::DOp::PBS_ML8_F(op_impl) => self.apply_pbs2reg(
8,
op_impl.0.dst_rid,
op_impl.0.src_rid,
op_impl.0.gid,
),
_ => panic!("Error: {dop:?} must have been handled by invariant path"),
}
}
}
}
// Dump operation src/dst to a file if required
self.dump_op_reg(dop);
if !self.options.nops {
self.dump_op_reg(dop);
}
// Increment program counter
self.pc += 1;
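
For context on the `msg_cst * delta()` encoding used by the ADDS/SUBS/SSUB arms above: the immediate is lifted into the plaintext domain by shifting it into the most-significant bits of the u64. A toy, self-contained illustration; the exact value of `delta()` depends on the parameter set, so the shift below is an assumption (2-bit message + 2-bit carry under one padding bit):

fn main() {
    // Toy illustration only, not the mockup's code. Assumed encoding: one padding
    // bit, then the 4-bit message/carry block, i.e. delta = 1 << (64 - 1 - 4).
    let delta: u64 = 1 << 59;
    let msg_cst: u64 = 3;
    // This is the Plaintext value handed to lwe_ciphertext_plaintext_add_assign.
    let msg_encoded = msg_cst * delta;
    assert_eq!(msg_encoded >> 59, 3);
    println!("encoded immediate: {msg_encoded:#018x}");
}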


@@ -60,6 +60,13 @@ pub struct Args {
#[clap(long, value_parser, default_value_t = 1_000_000)]
quantum_us: usize,
/// Simulation bypass.
/// Disables execution; this obviously leads to incorrect behavior but keeps
/// accurate performance estimation.
/// For correct behavior (but inaccurate perf estimation), use a "fast" parameter set.
#[clap(long, value_parser)]
nops: bool,
// Dump configuration ----------------------------------------------------
// Used to activate some dump features for the generation of simulation stimulus
/// Specify the stimulus dump folder.
@@ -96,6 +103,7 @@ impl From<&Args> for MockupOptions {
dump_reg: args.dump_reg,
report_out: args.report_out.clone(),
report_trace: args.report_trace,
nops: args.nops,
}
}
}
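
A hedged usage sketch of how the flag reaches the simulator options. It assumes `Args` derives `clap::Parser`, which the `#[clap(...)]` attributes imply but this hunk does not show:

use clap::Parser;

fn main() {
    // Hypothetical usage, not part of the diff: the CLI flag flows into
    // MockupOptions through the From<&Args> impl above.
    let args = Args::parse(); // e.g. the mockup binary invoked with `--nops`
    let options = MockupOptions::from(&args);
    if options.nops {
        // TFHE DOps will be bypassed: timing stays accurate,
        // register/ciphertext contents do not.
    }
}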


@@ -16,6 +16,7 @@ pub struct MockupOptions {
pub dump_reg: bool,
pub report_out: Option<String>,
pub report_trace: bool,
pub nops: bool,
}
impl MockupOptions {