Warmup function

DmytroTym
2024-03-06 18:13:23 +02:00
parent b22aa02e91
commit 7185657ff7
4 changed files with 24 additions and 19 deletions

View File

@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@ where
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@ where
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();
             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@ where
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }

 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
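
Taken together, the hunks above restructure the batched MSM test: the CudaStream is now created and warmed up once before the test-size/batch-size loops, reused by every iteration, and destroyed only after all iterations finish, instead of being created and destroyed inside the inner loop. A rough Rust sketch of the resulting control flow, with the per-iteration MSM work elided and a placeholder function name (illustration only, not the test's actual code):

use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn sketch_of_restructured_test() {
    let test_sizes = [1000, 1 << 16];
    let batch_sizes = [1, 3, 1 << 4];

    // One stream for the whole run; warming it up front means the first
    // allocations and copies inside the loops are not artificially slow.
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();

    for test_size in test_sizes {
        for batch_size in batch_sizes {
            // ... cuda_malloc device buffers, copy_from_host_async on `stream`,
            // run the MSMs, stream.synchronize().unwrap(), compare results ...
            let _ = (test_size, batch_size);
        }
    }

    // The stream outlives every iteration and is destroyed exactly once.
    stream.destroy().unwrap();
}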

View File

@@ -77,6 +77,7 @@ fn main() {
.allowlist_function("cudaMemset")
.allowlist_function("cudaMemsetAsync")
.allowlist_function("cudaDeviceGetDefaultMemPool")
.allowlist_function("cudaMemGetInfo")
.rustified_enum("cudaMemcpyKind")
// Stream Ordered Memory Allocator
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
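
For context, allowlist_function is what exposes a CUDA runtime symbol to the Rust side: bindgen emits an extern declaration for it in the generated bindings module, and the new device::warmup (next file) calls cudaMemGetInfo through exactly such a declaration, passing *mut usize pointers. The snippet below only illustrates that shape; the error type is a stand-in, since the real alias comes from the generated bindings:

// Illustration only: approximate shape of the FFI item bindgen generates for
// an allowlisted function. `cudaError_t` is a stand-in alias here; the real
// type is derived from cuda_runtime_api.h by the build script.
#[allow(non_camel_case_types)]
pub type cudaError_t = i32;

extern "C" {
    // C signature: cudaError_t cudaMemGetInfo(size_t* free, size_t* total);
    pub fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> cudaError_t;
}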

View File

@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;

 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates the default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down.
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
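
The mechanism, restated: warmup asks cudaMemGetInfo how much device memory is currently free, asynchronously allocates half of it (free_memory >> 1) on the caller's stream, and immediately frees it again, so the default memory pool is primed and the GPU is warmed up before any timed work runs. A minimal usage sketch, using only calls that appear in this diff:

use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn main() {
    // Create the stream the workload will run on.
    let stream = CudaStream::create().unwrap();

    // Warm up once, right after stream creation and before any timed work.
    warmup(&stream).unwrap();

    // ... enqueue copies / kernels / MSM calls on `stream` ...

    stream.synchronize().unwrap();
    stream.destroy().unwrap();
}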

View File

@@ -1,5 +1,5 @@
 use crate::bindings::{
-    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
@@ -118,18 +118,6 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
         }
     }

-    pub fn cuda_free_async(&mut self, stream: &CudaStream) -> CudaResult<()> {
-        if let Self::Device(s, device_id) = self {
-            check_device(*device_id);
-            if !s.is_empty() {
-                unsafe {
-                    cudaFreeAsync(s.as_mut_ptr() as *mut c_void, stream.handle as *mut _ as *mut _).wrap()?;
-                }
-            }
-        }
-        Ok(())
-    }
-
     pub fn copy_from_host(&mut self, val: &[T]) -> CudaResult<()> {
         match self {
             Self::Device(_, device_id) => check_device(*device_id),