feat: add warmup for CudaStream (#422)

## Describe the changes

Add a non-blocking `warmup` function for `CudaStream`.

> When you run the benchmark (e.g. the msm example you have), the first
> instance is always slow, with a constant overhead of 200~300ms of CUDA
> stream warmup. I want to get rid of that in my application by warming
> the stream up in parallel while my host does something else.
Authored by DmytroTym on 2024-03-07 19:11:34 +02:00; committed by GitHub.
5 changed files with 30 additions and 9 deletions.
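
A minimal sketch of the intended usage pattern described above (not part of the diff): only the APIs this PR touches (`CudaStream::create`, `warmup`, `synchronize`, `destroy`) are used, and the host-side work is a placeholder.

```rust
use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn main() {
    // `warmup` only enqueues an async malloc/free pair on the stream and
    // returns immediately, so the GPU primes its context and default
    // memory pool while the host keeps running.
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();

    // ... placeholder: host-side setup (read inputs, build configs) ...

    // Wait for the warmup before issuing latency-sensitive work on this
    // stream; the one-time ~200-300ms first-use overhead has already
    // been absorbed.
    stream
        .synchronize()
        .unwrap();
    stream
        .destroy()
        .unwrap();
}
```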

```diff
@@ -9,7 +9,7 @@ on:
 jobs:
   test-deploy:
-    name: Test deployment of docs webiste
+    name: Test deployment of docs website
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
```

```diff
@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@ where
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@ where
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();
             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@ where
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }

 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
```

```diff
@@ -77,6 +77,7 @@ fn main() {
         .allowlist_function("cudaMemset")
         .allowlist_function("cudaMemsetAsync")
         .allowlist_function("cudaDeviceGetDefaultMemPool")
+        .allowlist_function("cudaMemGetInfo")
         .rustified_enum("cudaMemcpyKind")
         // Stream Ordered Memory Allocator
         // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
```
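
For context (not from the diff): allowlisting a function tells bindgen to emit an FFI declaration for it. A sketch of roughly what the generated binding for `cudaMemGetInfo` looks like follows; the `*mut usize` parameters are implied by the call site in `device.rs` below, and `cudaError_t` is stubbed so the snippet stands alone.

```rust
// Sketch only: approximate shape of the bindgen output for the newly
// allowlisted CUDA runtime call
//     cudaError_t cudaMemGetInfo(size_t* free, size_t* total);
#[allow(non_camel_case_types)]
type cudaError_t = i32; // stand-in for the generated (rustified) enum

extern "C" {
    pub fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> cudaError_t;
}
```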

```diff
@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;

 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates the default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
```
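
To sanity-check the effect, one could time the first allocation after a synchronized warmup. A hypothetical micro-benchmark (not part of this PR), reusing `HostOrDeviceSlice::cuda_malloc` as it appears in the tests above:

```rust
use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
use icicle_cuda_runtime::stream::CudaStream;
use std::time::Instant;

fn main() {
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();
    // Block until the async malloc/free pair has primed the pool.
    stream.synchronize().unwrap();

    // With the context initialized and the default pool grown, the first
    // "real" allocation should no longer pay the one-time setup cost.
    let start = Instant::now();
    let _buf: HostOrDeviceSlice<u64> = HostOrDeviceSlice::cuda_malloc(1 << 20).unwrap();
    println!("first allocation after warmup took {:?}", start.elapsed());

    stream.destroy().unwrap();
}
```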

```diff
@@ -47,14 +47,18 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     pub fn as_mut_slice(&mut self) -> &mut [T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_mut_slice(),
         }
     }

     pub fn as_slice(&self) -> &[T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_slice(),
         }
     }
```