rust classic benches with Criterion for ecntt/msm/ntt (#499)

Rust idiomatic benches for EC NTT, NTT, MSM
This commit is contained in:
VitaliiH
2024-05-05 10:28:41 +02:00
committed by GitHub
parent f6758f3447
commit 34f0212c0d
20 changed files with 380 additions and 14 deletions

View File

@@ -206,23 +206,35 @@ macro_rules! impl_ecntt_bench {
use icicle_core::ntt::NTTDomain;
use icicle_cuda_runtime::device_context::DEFAULT_DEVICE_ID;
let group_id = format!("{} EC NTT", $field_prefix);
let group_id = format!("{} EC NTT ", $field_prefix);
let mut group = c.benchmark_group(&group_id);
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
const MAX_SIZE: u64 = 1 << 18;
const MAX_LOG2: u32 = 9; // max length = 2 ^ MAX_LOG2 //TODO: should be limited by device ram only after fix
let max_log2 = env::var("MAX_LOG2")
.unwrap_or_else(|_| MAX_LOG2.to_string())
.parse::<u32>()
.unwrap_or(MAX_LOG2);
const FAST_TWIDDLES_MODE: bool = false;
INIT.get_or_init(move || init_domain::<$field>(MAX_SIZE, DEFAULT_DEVICE_ID, FAST_TWIDDLES_MODE));
INIT.get_or_init(move || init_domain::<$field>(1 << max_log2, DEFAULT_DEVICE_ID, FAST_TWIDDLES_MODE));
for test_size_log2 in [4, 8] {
for batch_size_log2 in [1, 1 << 4, 128] {
let test_size = 1 << test_size_log2;
let batch_size = 1 << batch_size_log2;
let full_size = batch_size * test_size;
if full_size > 1 << max_log2 {
continue;
}
let test_sizes = [1 << 4, 1 << 8];
let batch_sizes = [1, 1 << 4, 128];
for test_size in test_sizes {
for batch_size in batch_sizes {
let points = C::generate_random_projective_points(test_size);
let points = HostSlice::from_slice(&points);
let mut batch_ntt_result = vec![Projective::<C>::zero(); batch_size * test_size];
let mut batch_ntt_result = vec![Projective::<C>::zero(); full_size];
let batch_ntt_result = HostSlice::from_mut_slice(&mut batch_ntt_result);
let mut config = NTTConfig::default();
for is_inverse in [NTTDir::kInverse, NTTDir::kForward] {

View File

@@ -317,3 +317,138 @@ macro_rules! impl_msm_tests {
}
};
}
/// Generates a Criterion benchmark for batched MSM over `$curve`.
///
/// * `$field_prefix` - string literal used as the prefix of the benchmark group name.
/// * `$curve` - curve configuration type; it is substituted for the generic parameter `C`
///   of `check_msm_batch`.
///
/// The expansion ends with `criterion_group!` / `criterion_main!`, so the macro is meant to be
/// invoked once, at the top level of a bench target.
///
/// The largest benchmarked length is `2 ^ MAX_LOG2` (default 25). It can be overridden through
/// the `MAX_LOG2` environment variable; a missing or unparsable value falls back to the default.
#[macro_export]
macro_rules! impl_msm_bench {
    (
      $field_prefix:literal,
      $curve:ident
    ) => {
        use criterion::criterion_group;
        use criterion::criterion_main;
        use criterion::Criterion;
        use icicle_core::curve::Affine;
        use icicle_core::curve::Curve;
        use icicle_core::curve::Projective;
        use icicle_core::msm::msm;
        use icicle_core::msm::MSMConfig;
        use icicle_core::msm::MSM;
        use icicle_core::traits::FieldImpl;
        use icicle_core::traits::GenerateRandom;
        use icicle_cuda_runtime::device::warmup;
        use icicle_cuda_runtime::memory::DeviceVec;
        use icicle_cuda_runtime::memory::HostOrDeviceSlice;
        use icicle_cuda_runtime::memory::HostSlice;

        /// Runs a single `msm` call and panics if it reports an error.
        ///
        /// `_seed` is not read; the caller passes `black_box(1)` for it.
        fn msm_for_bench<C: Curve + MSM<C>>(
            scalars_h: &(impl HostOrDeviceSlice<C::ScalarField> + ?Sized),
            precomputed_points_d: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
            cfg: &MSMConfig,
            msm_results: &mut (impl HostOrDeviceSlice<Projective<C>> + ?Sized),
            _seed: u32,
        ) {
            msm(scalars_h, precomputed_points_d, cfg, msm_results).unwrap();
        }

        /// Benchmarks `msm` for every combination of
        /// size (`2^13 ..= 2^max_log2`), precompute factor (1, 4, 8) and batch size (1, 16, 128)
        /// whose total scalar count does not exceed `2^max_log2`.
        fn check_msm_batch<C: Curve + MSM<C>>(c: &mut Criterion)
        where
            <C::ScalarField as FieldImpl>::Config: GenerateRandom<C::ScalarField>,
        {
            use criterion::black_box;
            use criterion::SamplingMode;
            use icicle_core::msm::precompute_bases;
            use icicle_core::msm::tests::generate_random_affine_points_with_zeroes;
            use icicle_cuda_runtime::stream::CudaStream;
            use std::env;

            const MAX_LOG2: u32 = 25; // max length = 2 ^ MAX_LOG2

            let group_id = format!("{} MSM ", $field_prefix);
            let mut group = c.benchmark_group(&group_id);
            group.sampling_mode(SamplingMode::Flat);
            group.sample_size(10);

            // Missing or malformed `MAX_LOG2` silently falls back to the compiled-in default.
            let max_log2 = env::var("MAX_LOG2")
                .ok()
                .and_then(|value| value.parse::<u32>().ok())
                .unwrap_or(MAX_LOG2);

            let stream = CudaStream::create().unwrap();
            let mut cfg = MSMConfig::default();
            cfg.ctx
                .stream = &stream;
            // The MSM is issued asynchronously on `stream`; the stream is synchronized only after
            // each `bench_function` call returns.
            // NOTE(review): presumably the timed closure then measures enqueue time unless `msm`
            // blocks internally - confirm.
            cfg.is_async = true;
            cfg.large_bucket_factor = 5;
            cfg.c = 4;

            warmup(&stream).unwrap();

            for test_size_log2 in 13u32..=max_log2 {
                let test_size = 1 << test_size_log2;
                let points = generate_random_affine_points_with_zeroes(test_size, 10);

                for precompute_factor in [1, 4, 8] {
                    let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
                    precompute_bases(
                        HostSlice::from_slice(&points),
                        precompute_factor as i32,
                        0,
                        &cfg.ctx,
                        &mut precomputed_points_d,
                    )
                    .unwrap();

                    for batch_size_log2 in [0, 4, 7] {
                        let batch_size = 1 << batch_size_log2;
                        let full_size = batch_size * test_size;

                        // Skip combinations whose total scalar count exceeds the configured cap.
                        if full_size > 1 << max_log2 {
                            continue;
                        }

                        let scalars = <C::ScalarField as FieldImpl>::Config::generate_random(full_size);
                        // a version of batched msm without using `cfg.points_size`, requires copying bases
                        let scalars_h = HostSlice::from_slice(&scalars);

                        let mut msm_results = DeviceVec::<Projective<C>>::cuda_malloc(batch_size).unwrap();

                        // NOTE(review): `points_d` is never passed to `msm_for_bench` below, and it is
                        // allocated with `full_size` elements while `points` holds `test_size` -
                        // confirm whether this buffer and copy are still needed.
                        let mut points_d = DeviceVec::<Affine<C>>::cuda_malloc(full_size).unwrap();
                        points_d
                            .copy_from_host_async(HostSlice::from_slice(&points), &stream)
                            .unwrap();

                        cfg.precompute_factor = precompute_factor as i32;

                        let bench_descr = format!(
                            " {} x {} with precomp = {:?}",
                            test_size, batch_size, precompute_factor
                        );

                        group.bench_function(&bench_descr, |b| {
                            b.iter(|| {
                                msm_for_bench(
                                    scalars_h,
                                    &precomputed_points_d[..],
                                    &cfg,
                                    &mut msm_results[..],
                                    black_box(1),
                                )
                            })
                        });

                        stream
                            .synchronize()
                            .unwrap();
                    }
                }
            }

            // Finish the group explicitly, matching the NTT benchmark.
            group.finish();

            stream
                .destroy()
                .unwrap();
        }

        criterion_group!(benches, check_msm_batch<$curve>);
        criterion_main!(benches);
    };
}

View File

@@ -15,7 +15,7 @@ use ark_ec::VariableBaseMSM;
#[cfg(feature = "arkworks")]
use ark_std::{rand::Rng, test_rng, UniformRand};
fn generate_random_affine_points_with_zeroes<C: Curve>(size: usize, num_zeroes: usize) -> Vec<Affine<C>> {
pub fn generate_random_affine_points_with_zeroes<C: Curve>(size: usize, num_zeroes: usize) -> Vec<Affine<C>> {
let rng = &mut test_rng();
let mut points = C::generate_random_affine_points(size);
for _ in 0..num_zeroes {

View File

@@ -414,3 +414,135 @@ macro_rules! impl_ntt_tests {
}
};
}
/// Generates a Criterion benchmark for batched NTT over `$field`.
///
/// * `$field_prefix` - string literal used as the prefix of the benchmark group name.
/// * `$field` - field type; it is used both to initialise the NTT domain and as the two generic
///   arguments of `benchmark_ntt`.
///
/// The expansion ends with `criterion_group!` / `criterion_main!`, so the macro is meant to be
/// invoked once, at the top level of a bench target.
///
/// The largest benchmarked length is `2 ^ MAX_LOG2` (default 25). It can be overridden through
/// the `MAX_LOG2` environment variable; a missing or unparsable value falls back to the default.
#[macro_export]
macro_rules! impl_ntt_bench {
    (
      $field_prefix:literal,
      $field:ident
    ) => {
        use icicle_core::ntt::ntt;
        use icicle_core::ntt::NTTDomain;
        use icicle_cuda_runtime::memory::HostOrDeviceSlice;
        use std::sync::OnceLock;

        use criterion::{black_box, criterion_group, criterion_main, Criterion};
        use icicle_core::{
            ntt::{FieldImpl, NTTConfig, NTTDir, NttAlgorithm, Ordering},
            traits::ArkConvertible,
        };

        use icicle_core::ntt::NTT;
        use icicle_cuda_runtime::memory::HostSlice;
        use icicle_core::traits::GenerateRandom;
        use icicle_core::vec_ops::VecOps;

        /// Runs a single `ntt` call and panics if it reports an error.
        ///
        /// Only `input`, `is_inverse`, `config` and `batch_ntt_result` are forwarded to `ntt`;
        /// `test_sizes`, `batch_size`, `ordering` and `_seed` are not read here.
        fn ntt_for_bench<T, F: FieldImpl>(
            input: &(impl HostOrDeviceSlice<F> + ?Sized),
            batch_ntt_result: &mut (impl HostOrDeviceSlice<F> + ?Sized),
            test_sizes: usize,
            batch_size: usize,
            is_inverse: NTTDir,
            ordering: Ordering,
            config: &mut NTTConfig<F>,
            _seed: u32,
        ) where
            <F as FieldImpl>::Config: NTT<F, F> + GenerateRandom<F>,
            <F as FieldImpl>::Config: VecOps<F>,
        {
            ntt(input, is_inverse, config, batch_ntt_result).unwrap();
        }

        // Guards the one-time NTT domain initialisation performed in `benchmark_ntt`.
        static INIT: OnceLock<()> = OnceLock::new();

        /// Benchmarks `ntt` for every combination of size (`2^13 ..= 2^max_log2`),
        /// batch size (`2^7 .. 2^17`), direction and ordering whose total element count does not
        /// exceed `2^max_log2`.
        fn benchmark_ntt<T, F: FieldImpl>(c: &mut Criterion)
        where
            <F as FieldImpl>::Config: NTT<F, F> + GenerateRandom<F>,
            <F as FieldImpl>::Config: VecOps<F>,
        {
            use criterion::SamplingMode;
            use icicle_core::ntt::ntt;
            use icicle_core::ntt::tests::init_domain;
            use icicle_core::ntt::NTTDomain;
            use icicle_cuda_runtime::device_context::DEFAULT_DEVICE_ID;
            use std::env;

            const MAX_LOG2: u32 = 25; // max length = 2 ^ MAX_LOG2
            const FAST_TWIDDLES_MODE: bool = false;

            let group_id = format!("{} NTT", $field_prefix);
            let mut group = c.benchmark_group(&group_id);
            group.sampling_mode(SamplingMode::Flat);
            group.sample_size(10);

            // Missing or malformed `MAX_LOG2` silently falls back to the compiled-in default.
            let max_log2 = env::var("MAX_LOG2")
                .ok()
                .and_then(|value| value.parse::<u32>().ok())
                .unwrap_or(MAX_LOG2);

            INIT.get_or_init(move || init_domain::<$field>(1 << max_log2, DEFAULT_DEVICE_ID, FAST_TWIDDLES_MODE));

            for test_size_log2 in 13u32..=max_log2 {
                let test_size = 1 << test_size_log2;

                for batch_size_log2 in 7u32..17u32 {
                    let batch_size = 1 << batch_size_log2;
                    let full_size = batch_size * test_size;

                    // Skip combinations whose total element count exceeds the configured cap.
                    if full_size > 1 << max_log2 {
                        continue;
                    }

                    let scalars = F::Config::generate_random(full_size);
                    let input = HostSlice::from_slice(&scalars);

                    let mut batch_ntt_result = vec![F::zero(); full_size];
                    let batch_ntt_result = HostSlice::from_mut_slice(&mut batch_ntt_result);

                    // `ntt_algorithm` is left at its `NTTConfig::default()` value; only the
                    // ordering changes between runs of the same shape.
                    let mut config = NTTConfig::default();
                    config.batch_size = batch_size as i32;

                    for is_inverse in [NTTDir::kInverse, NTTDir::kForward] {
                        for ordering in [
                            Ordering::kNN,
                            Ordering::kNR, // times are ~ same as kNN
                            Ordering::kRN,
                            Ordering::kRR,
                            Ordering::kNM,
                            Ordering::kMN,
                        ] {
                            config.ordering = ordering;

                            let bench_descr = format!(
                                "{:?} {:?} {} x {}",
                                ordering, is_inverse, test_size, batch_size
                            );

                            group.bench_function(&bench_descr, |b| {
                                b.iter(|| {
                                    ntt_for_bench::<F, F>(
                                        input,
                                        batch_ntt_result,
                                        test_size,
                                        batch_size,
                                        is_inverse,
                                        ordering,
                                        &mut config,
                                        black_box(1),
                                    )
                                })
                            });
                        }
                    }
                }
            }

            group.finish();
        }

        criterion_group!(benches, benchmark_ntt<$field, $field>);
        criterion_main!(benches);
    };
}

View File

@@ -38,4 +38,12 @@ arkworks = ["ark-bls12-377", "icicle-core/arkworks"]
[[bench]]
name = "ecntt"
harness = false # Criterion provides own harness
harness = false
[[bench]]
name = "ntt"
harness = false
[[bench]]
name = "msm"
harness = false

View File

@@ -4,7 +4,7 @@ use icicle_bls12_377::curve::{CurveCfg, ScalarField};
#[cfg(feature = "ec_ntt")]
use icicle_core::impl_ecntt_bench;
#[cfg(feature = "ec_ntt")]
impl_ecntt_bench!("BLS12_377", ScalarField, CurveCfg);
impl_ecntt_bench!("bls12_377", ScalarField, CurveCfg);
#[cfg(not(feature = "ec_ntt"))]
fn main() {}

View File

@@ -0,0 +1,5 @@
use icicle_bls12_377::curve::CurveCfg;
use icicle_core::impl_msm_bench;
// Expands to the Criterion MSM benchmark for this curve. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_msm_bench!("bls12_377", CurveCfg);

View File

@@ -0,0 +1,5 @@
use icicle_bls12_377::curve::ScalarField;
use icicle_core::impl_ntt_bench;
// Expands to the Criterion NTT benchmark for this scalar field. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_ntt_bench!("bls12_377", ScalarField);

View File

@@ -36,4 +36,12 @@ arkworks = ["ark-bls12-381", "icicle-core/arkworks"]
[[bench]]
name = "ecntt"
harness = false # Criterion provides own harness
harness = false
[[bench]]
name = "ntt"
harness = false
[[bench]]
name = "msm"
harness = false

View File

@@ -4,7 +4,7 @@ use icicle_bls12_381::curve::{CurveCfg, ScalarField};
#[cfg(feature = "ec_ntt")]
use icicle_core::impl_ecntt_bench;
#[cfg(feature = "ec_ntt")]
impl_ecntt_bench!("BLS12_381", ScalarField, CurveCfg);
impl_ecntt_bench!("bls12_381", ScalarField, CurveCfg);
#[cfg(not(feature = "ec_ntt"))]
fn main() {}

View File

@@ -0,0 +1,5 @@
use icicle_bls12_381::curve::CurveCfg;
use icicle_core::impl_msm_bench;
// Expands to the Criterion MSM benchmark for this curve. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_msm_bench!("bls12_381", CurveCfg);

View File

@@ -0,0 +1,5 @@
use icicle_bls12_381::curve::ScalarField;
use icicle_core::impl_ntt_bench;
// Expands to the Criterion NTT benchmark for this scalar field. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_ntt_bench!("bls12_381", ScalarField);

View File

@@ -36,4 +36,12 @@ arkworks = ["ark-bn254", "icicle-core/arkworks"]
[[bench]]
name = "ecntt"
harness = false # Criterion provides own harness
harness = false
[[bench]]
name = "ntt"
harness = false
[[bench]]
name = "msm"
harness = false

View File

@@ -0,0 +1,5 @@
use icicle_bn254::curve::CurveCfg;
use icicle_core::impl_msm_bench;
// Expands to the Criterion MSM benchmark for this curve. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_msm_bench!("bn254", CurveCfg);

View File

@@ -0,0 +1,5 @@
use icicle_bn254::curve::ScalarField;
use icicle_core::impl_ntt_bench;
// Expands to the Criterion NTT benchmark for this scalar field. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_ntt_bench!("bn254", ScalarField);

View File

@@ -33,3 +33,15 @@ default = []
g2 = ["icicle-bls12-377/bw6-761-g2"]
devmode = ["icicle-core/devmode"]
arkworks = ["ark-bw6-761", "icicle-core/arkworks", "icicle-bls12-377/arkworks"]
# [[bench]]
# name = "ecntt" # sane compilation times only with the `devmode` feature enabled
# harness = false # Criterion provides own harness
[[bench]]
name = "ntt"
harness = false
[[bench]]
name = "msm"
harness = false

View File

@@ -0,0 +1,5 @@
use icicle_bw6_761::curve::CurveCfg;
use icicle_core::impl_msm_bench;
// Expands to the Criterion MSM benchmark for this curve. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_msm_bench!("bw6_761", CurveCfg);

View File

@@ -0,0 +1,5 @@
use icicle_bw6_761::curve::ScalarField;
use icicle_core::impl_ntt_bench;
// Expands to the Criterion NTT benchmark for this scalar field. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_ntt_bench!("bw6_761", ScalarField);

View File

@@ -8,6 +8,7 @@ homepage.workspace = true
repository.workspace = true
[dependencies]
criterion = "0.3"
icicle-core = { workspace = true }
icicle-cuda-runtime = { workspace = true }
ark-grumpkin-test = { git = "https://github.com/ingonyama-zk/ark-grumpkin-test.git", optional = true}
@@ -16,6 +17,7 @@ ark-grumpkin-test = { git = "https://github.com/ingonyama-zk/ark-grumpkin-test.g
cmake = "0.1.50"
[dev-dependencies]
criterion = "0.3"
ark-std = "0.4.0"
ark-ff = "0.4.0"
ark-ec = "0.4.0"
@@ -28,3 +30,7 @@ default = []
ec_ntt = ["icicle-core/ec_ntt"]
devmode = ["icicle-core/devmode"]
arkworks = ["ark-grumpkin-test", "icicle-core/arkworks"]
[[bench]]
name = "msm"
harness = false

View File

@@ -0,0 +1,5 @@
use icicle_grumpkin::curve::CurveCfg;
use icicle_core::impl_msm_bench;
// Expands to the Criterion MSM benchmark for this curve. The macro also emits
// `criterion_group!` / `criterion_main!`, so this invocation is the whole bench target.
impl_msm_bench!("grumpkin", CurveCfg);