diff --git a/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py b/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
index edc6eb411..a69d87899 100644
--- a/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
+++ b/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
@@ -31,14 +31,22 @@ class LD(BaseInstruction):
         self.__dict__ = d
 
     def args(self):
-        return f'R{self.rid} @{hex(self.slot["Addr"])}'
+        try:
+            return f'R{self.rid} @{hex(self.slot["Addr"])}'
+        except (KeyError, TypeError):
+            # It can happen that an IOP is not translated by the FW
+            return f'R{self.rid} @{self.slot}'
 
 class ST(BaseInstruction):
     def __init__(self, d):
         self.__dict__ = d
 
     def args(self):
-        return f'@{hex(self.slot["Addr"])} R{self.rid}'
+        try:
+            return f'@{hex(self.slot["Addr"])} R{self.rid}'
+        except (KeyError, TypeError):
+            # It can happen that an IOP is not translated by the FW
+            return f'@{self.slot} R{self.rid}'
 
 class MAC(BaseInstruction):
     def __init__(self, d):
diff --git a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
index 25e5f4d9a..8a3210f43 100644
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -176,6 +176,18 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
     imm: 1,
 };
 
+pub const SIMD_N: usize = 12; //TODO: We need to come up with a way to have this dynamic
+pub const IOP_NCT_F_2NCT: ConstIOpProto<SIMD_N, { 2 * SIMD_N }> = ConstIOpProto {
+    dst: [VarMode::Native; SIMD_N],
+    src: [VarMode::Native; 2 * SIMD_N],
+    imm: 0,
+};
+pub const IOP_2NCT_F_3NCT: ConstIOpProto<{ 2 * SIMD_N }, { 3 * SIMD_N }> = ConstIOpProto {
+    dst: [VarMode::Native; 2 * SIMD_N],
+    src: [VarMode::Native; 3 * SIMD_N],
+    imm: 0,
+};
+
 use crate::iop;
 use arg::IOpFormat;
 use lazy_static::lazy_static;
@@ -227,4 +239,6 @@ iop!(
     [IOP_CT_F_CT -> "LEAD1", opcode::LEAD1],
     [IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
     [IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
+    [IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
+    [IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
 );
diff --git a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
index 34cde3c0d..13da89569 100644
--- a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
@@ -87,6 +87,10 @@ pub const LEAD1: u8 = 0x85;
 pub const TRAIL0: u8 = 0x86;
 pub const TRAIL1: u8 = 0x87;
 
+// SIMD for maximum throughput
+pub const ADD_SIMD: u8 = 0xF0;
+pub const ERC_20_SIMD: u8 = 0xF1;
+//
 // Utility operations
 // Used to handle real clone of ciphertext already uploaded in the Hpu memory
 pub const MEMCPY: u8 = 0xFF;
diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
index e08576a2d..700ee37c5 100644
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
@@ -72,6 +72,9 @@ crate::impl_fw!("Ilp" [
     LEAD1 => fw_impl::ilp_log::iop_lead1;
     TRAIL0 => fw_impl::ilp_log::iop_trail0;
     TRAIL1 => fw_impl::ilp_log::iop_trail1;
+    // SIMD Implementations
+    ADD_SIMD => fw_impl::llt::iop_add_simd;
+    ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
 ]);
 
 #[instrument(level = "trace", skip(prog))]
diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
index bd4df9dca..900adf063 100644
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
@@ -57,16 +57,16 @@ crate::impl_fw!("Llt" [
     OVF_SSUB => fw_impl::ilp::iop_overflow_ssub;
     OVF_MULS => fw_impl::ilp::iop_overflow_muls;
 
-    BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
-    BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
-    BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
+    BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
+    BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
+    BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
 
-    CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
-    CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
-    CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
-    CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
-    CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
-    CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
+    CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
+    CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
+    CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
+    CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
+    CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
+    CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
 
     IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
     IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
@@ -81,6 +81,10 @@ crate::impl_fw!("Llt" [
     LEAD1 => fw_impl::ilp_log::iop_lead1;
     TRAIL0 => fw_impl::ilp_log::iop_trail0;
     TRAIL1 => fw_impl::ilp_log::iop_trail1;
+
+    // SIMD Implementations
+    ADD_SIMD => fw_impl::llt::iop_add_simd;
+    ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
 ]);
 
 // ----------------------------------------------------------------------------
@@ -102,6 +106,17 @@ pub fn iop_add(prog: &mut Program) {
     iop_addx(prog, dst, src_a, src_b);
 }
 
+#[instrument(level = "trace", skip(prog))]
+pub fn iop_add_simd(prog: &mut Program) {
+    // Add Comment header
+    prog.push_comment("ADD_SIMD Operand::Dst Operand::Src Operand::Src".to_string());
+    simd(
+        prog,
+        crate::asm::iop::SIMD_N,
+        fw_impl::llt::iop_add_ripple_rtl,
+    );
+}
+
 pub fn iop_adds(prog: &mut Program) {
     // Allocate metavariables:
     // Dest -> Operand
@@ -209,25 +224,46 @@ pub fn iop_muls(prog: &mut Program) {
     iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
 }
 
+#[instrument(level = "trace", skip(prog))]
+pub fn iop_erc_20(prog: &mut Program) {
+    // Add Comment header
+    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
+    iop_erc_20_rtl(prog, 0).add_to_prog(prog);
+}
+
+#[instrument(level = "trace", skip(prog))]
+pub fn iop_erc_20_simd(prog: &mut Program) {
+    // Add Comment header
+    prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
+    simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
+}
+
+// ----------------------------------------------------------------------------
+// Helper Functions
+// ----------------------------------------------------------------------------
+
 /// Implement erc_20 fund xfer
 /// Targeted algorithm is as follow:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
 /// 3. Compute new amount (from - new_amount, to + new_amount)
+///
+/// The input operands are:
+/// (from[0], to[0], amount[0], ..., from[N-1], to[N-1], amount[N-1])
+/// The output operands are:
+/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
+/// Where N is the batch size
 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
+pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
     // Allocate metavariables:
     // Dest -> Operand
-    let dst_from = prog.iop_template_var(OperandKind::Dst, 0);
-    let dst_to = prog.iop_template_var(OperandKind::Dst, 1);
+    let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
+    let dst_to = prog.iop_template_var(OperandKind::Dst, 2 * batch_index + 1);
     // Src -> Operand
-    let src_from = prog.iop_template_var(OperandKind::Src, 0);
-    let src_to = prog.iop_template_var(OperandKind::Src, 1);
+    let src_from = prog.iop_template_var(OperandKind::Src, 3 * batch_index);
+    let src_to = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 1);
     // Src Amount -> Operand
-    let src_amount = prog.iop_template_var(OperandKind::Src, 2);
-
-    // Add Comment header
-    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
+    let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
 
     // TODO: Make this a parameter or sweep this
     // All these little parameters would be very handy to write an
@@ -236,7 +272,7 @@ pub fn iop_erc_20(prog: &mut Program) {
     let kogge_blk_w = 10;
     let ripple = true;
 
-    let tree = {
+    {
         let props = prog.params();
         let tfhe_params: asm::DigitParameters = props.clone().into();
         let lut = pbs_by_name!("IfFalseZeroed");
@@ -273,13 +309,26 @@ pub fn iop_erc_20(prog: &mut Program) {
             kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
                 + kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
         }
-    };
-    tree.add_to_prog(prog);
+    }
+}
+
+/// A SIMD implementation of add for maximum throughput
+#[instrument(level = "trace", skip(prog))]
+pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
+    // Allocate metavariables:
+    let dst = prog.iop_template_var(OperandKind::Dst, i);
+    let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
+    let src_b = prog.iop_template_var(OperandKind::Src, 2 * i + 1);
+
+    // Convert MetaVarCell in VarCell for Rtl analysis
+    let a = VarCell::from_vec(src_a);
+    let b = VarCell::from_vec(src_b);
+    let d = VarCell::from_vec(dst);
+
+    // Do a + b with the ripple carry adder
+    kogge::ripple_add(d, a, b, None)
 }
 
-// ----------------------------------------------------------------------------
-// Helper Functions
-// ----------------------------------------------------------------------------
 fn iop_addx(
     prog: &mut Program,
     dst: Vec<MetaVarCell>,
@@ -471,7 +520,12 @@ pub fn iop_mulx(
     // Note: The break-even point might not be this one, but choosing the right
     // point is uninportant since we'll leap imensely the number of batches from
     // FPGA to ASIC.
-    if prog.params().pbs_batch_w >= dst.len() {
+    let parallel = prog
+        .op_cfg()
+        .parallel
+        .unwrap_or_else(|| prog.params().pbs_batch_w >= dst.len());
+
+    if parallel {
         iop_mulx_par(prog, dst, src_a, src_b)
     } else {
         iop_mulx_ser(prog, dst, src_a, src_b)
@@ -708,3 +762,23 @@ fn bw_inv(prog: &mut Program, b: Vec<MetaVarCell>) -> Vec<MetaVarCell> {
         })
         .collect::<Vec<_>>()
 }
+
+/// Creates a SIMD version of the closure
+/// Make sure that the closure is a PBS optimized version of the operation
+/// The closure receives as inputs the program and the batch index.
+/// How the ASM operands are actually organized is defined by the closure
+/// itself.
+///
+/// Maybe this should go into a SIMD firmware implementation... At some point we
+/// would need a mechanism to choose between implementations on the fly to make
+/// real good use of all of this.
+fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F)
+where
+    F: Fn(&mut Program, u8) -> Rtl,
+{
+    (0..batch_size)
+        .map(|i| i as u8)
+        .map(|i| rtl_closure(prog, i))
+        .sum::<Rtl>()
+        .add_to_prog(prog);
+}
diff --git a/backends/tfhe-hpu-backend/src/fw/rtl/config.rs b/backends/tfhe-hpu-backend/src/fw/rtl/config.rs
index 3e3f82da7..eae6f3966 100644
--- a/backends/tfhe-hpu-backend/src/fw/rtl/config.rs
+++ b/backends/tfhe-hpu-backend/src/fw/rtl/config.rs
@@ -12,6 +12,8 @@ pub struct OpCfg {
     pub flush: bool,
     /// Whether to use latency tiers when scheduling
     pub use_tiers: bool,
+    /// Whether to use a massively parallel implementation
+    pub parallel: Option<bool>,
 }
 
 #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
diff --git a/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs b/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs
index 53d83109c..be5f932dc 100644
--- a/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs
@@ -1616,6 +1616,12 @@ impl Rtl {
     }
 }
 
+impl Default for Rtl {
+    fn default() -> Self {
+        Rtl(Vec::new())
+    }
+}
+
 impl std::ops::Add for Rtl {
     type Output = Rtl;
     fn add(self, rhs: Rtl) -> Self::Output {
@@ -1623,6 +1629,12 @@ impl std::ops::Add for Rtl {
     }
 }
 
+impl std::iter::Sum for Rtl {
+    fn sum<I: Iterator<Item = Rtl>>(iter: I) -> Self {
+        iter.fold(Rtl::default(), |acc, x| acc + x)
+    }
+}
+
 impl Drop for Rtl {
     fn drop(&mut self) {
         self.unload();
diff --git a/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs b/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs
index 4902a6450..47c71d9d5 100644
--- a/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs
+++ b/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs
@@ -64,13 +64,13 @@ pub struct IscPoolState {
     pub(super) vld: bool,
     pub(super) wr_lock: u32,
     pub(super) rd_lock: u32,
-    //pub(super) issue_lock: u32,
+    pub(super) issue_lock: u32,
     pub(super) sync_id: u32,
 }
 
 impl Len for IscPoolState {
     fn len() -> usize {
-        21
+        28
     }
 }
 
@@ -85,8 +85,8 @@ where
             vld: *(slice.get(2).ok_or(NoMoreBits)?),
             wr_lock: slice.get(3..10).ok_or(NoMoreBits)?.load::<u32>(),
             rd_lock: slice.get(10..17).ok_or(NoMoreBits)?.load::<u32>(),
-            //issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::<u32>(),
-            sync_id: slice.get(17..21).ok_or(NoMoreBits)?.load::<u32>(),
+            issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::<u32>(),
+            sync_id: slice.get(24..28).ok_or(NoMoreBits)?.load::<u32>(),
         })
     }
 }