Mirror of https://github.com/pseXperiments/icicle.git
# feat: add warmup for CudaStream (#422)
## Describe the changes

Add a non-blocking `warmup` function to `CudaStream`.

> When you run the benchmark (e.g. the msm example you have), the first instance is always slow, with a constant overhead of 200~300 ms of CUDA stream warmup. I want to get rid of that in my application by warming the stream up in parallel while my host does something else.
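A minimal sketch of the intended call pattern, using only the API this commit adds (`warmup`, plus the existing `CudaStream`); the host-side setup is a hypothetical placeholder:

```rust
use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

// Hypothetical host-side work to overlap with the GPU warmup.
fn prepare_inputs() -> Vec<u64> {
    (0..1_000_000u64).collect()
}

fn main() {
    let stream = CudaStream::create().unwrap();
    // Non-blocking: `warmup` only enqueues async work on the stream,
    // so it returns immediately and the host thread stays free.
    warmup(&stream).unwrap();
    let _inputs = prepare_inputs(); // runs while the GPU initializes
    // By the first real launch, the ~200-300 ms startup cost is already paid.
    stream
        .synchronize()
        .unwrap();
    stream
        .destroy()
        .unwrap();
}
```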
## Changed files

### .github/workflows/test-deploy-docs.yml (vendored, 2 lines changed)

```diff
@@ -9,7 +9,7 @@ on:
 jobs:
   test-deploy:
-    name: Test deployment of docs webiste
+    name: Test deployment of docs website
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
```
### MSM tests

```diff
@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@ where
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@ where
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();

             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@ where
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }

 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
```
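Net effect of these hunks: the `CudaStream` is hoisted out of the per-case loops. One stream is created and warmed up before iterating over the test and batch sizes, shared by every case, and destroyed once at the end, so only the warmup pays the startup cost. In outline (a sketch of the resulting control flow, not the full test):

```rust
use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn check_msm_outline() {
    let test_sizes = [1000, 1 << 16];
    let batch_sizes = [1, 3, 1 << 4];
    // One stream for the whole test, warmed up before the first case.
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();
    for _test_size in test_sizes {
        for _batch_size in batch_sizes {
            // ... allocate device buffers, copy inputs on `stream`,
            // run msm, synchronize, compare against the reference ...
        }
    }
    // Destroyed once, after all cases have run.
    stream
        .destroy()
        .unwrap();
}
```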
### icicle-cuda-runtime build script

```diff
@@ -77,6 +77,7 @@ fn main() {
         .allowlist_function("cudaMemset")
         .allowlist_function("cudaMemsetAsync")
         .allowlist_function("cudaDeviceGetDefaultMemPool")
+        .allowlist_function("cudaMemGetInfo")
         .rustified_enum("cudaMemcpyKind")
         // Stream Ordered Memory Allocator
         // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
```
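This bindgen allowlist entry is what makes `cudaMemGetInfo` visible to the Rust wrapper: only allowlisted runtime functions get generated bindings, and the `warmup` implementation below calls it (the `cudaMallocAsync` and `cudaFreeAsync` it also uses appear to be covered by the existing stream-ordered allocator entries).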
### icicle_cuda_runtime::device

```diff
@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;

 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
```
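The warmup itself is two stream-ordered steps: `cudaMemGetInfo` reports the currently free device memory, then an async allocation of half of it (`free_memory >> 1`) followed immediately by an async free forces CUDA context initialization and grows the default memory pool. Because both calls are asynchronous, `warmup` returns as soon as the work is enqueued; the startup cost is paid on the stream while the host keeps working, and subsequent allocations can be served from the already-grown pool.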
### icicle_cuda_runtime::memory

```diff
@@ -47,14 +47,18 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {

     pub fn as_mut_slice(&mut self) -> &mut [T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_mut_slice(),
         }
     }

     pub fn as_slice(&self) -> &[T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_slice(),
         }
     }
```