Compare commits

...

1 Commit

Author SHA1 Message Date
Guillermo Oyarzun
c538fb1ce6 chore(gpu): fix mul throughput benchmark 2025-06-20 17:51:20 +02:00

View File

@@ -394,9 +394,11 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
#[cfg(feature = "gpu")]
{
let safety_factor = 0.9f64; // Avoid OOM error in mul bench
let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
let elements =
(total_num_sm as f64 * block_multiplicator * operation_loading * safety_factor) as u64;
elements.min(1500) // This threshold is useful for operation with both a small number of
// block and low PBs count.
}