feat: add warmup for CudaStream (#422)

## Describe the changes

Add a non-blocking `warmup` function for `CudaStream`.

> When you run the benchmark (e.g. the msm example you have), the first
> instance is always slow, with a constant overhead of 200~300ms of CUDA
> stream warmup. I want to get rid of that in my application by warming
> the stream up in parallel while my host does something else.
Authored by DmytroTym on 2024-03-07 19:11:34 +02:00; committed by GitHub.
5 changed files with 30 additions and 9 deletions.
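
A minimal sketch of the intended usage pattern described above (not part of the diff): only the APIs this PR touches (`CudaStream::create`, `warmup`, `synchronize`, `destroy`) are used, and the host-side work is a placeholder.

```rust
use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn main() {
    // `warmup` only enqueues an async malloc/free pair on the stream and
    // returns immediately, so the GPU primes its context and default
    // memory pool while the host keeps running.
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();

    // ... placeholder: host-side setup (read inputs, build configs) ...

    // Wait for the warmup before issuing latency-sensitive work on this
    // stream; the one-time ~200-300ms first-use overhead has already
    // been absorbed.
    stream
        .synchronize()
        .unwrap();
    stream
        .destroy()
        .unwrap();
}
```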

```diff
@@ -9,7 +9,7 @@ on:
 jobs:
   test-deploy:
-    name: Test deployment of docs webiste
+    name: Test deployment of docs website
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
```

```diff
@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@ where
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@ where
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();
             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@ where
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }

 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
```

```diff
@@ -77,6 +77,7 @@ fn main() {
         .allowlist_function("cudaMemset")
         .allowlist_function("cudaMemsetAsync")
         .allowlist_function("cudaDeviceGetDefaultMemPool")
+        .allowlist_function("cudaMemGetInfo")
         .rustified_enum("cudaMemcpyKind")
         // Stream Ordered Memory Allocator
         // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
```
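
For context (not from the diff): allowlisting a function tells bindgen to emit an FFI declaration for it. A sketch of roughly what the generated binding for `cudaMemGetInfo` looks like follows; the `*mut usize` parameters are implied by the call site in `device.rs` below, and `cudaError_t` is stubbed so the snippet stands alone.

```rust
// Sketch only: approximate shape of the bindgen output for the newly
// allowlisted CUDA runtime call
//     cudaError_t cudaMemGetInfo(size_t* free, size_t* total);
#[allow(non_camel_case_types)]
type cudaError_t = i32; // stand-in for the generated (rustified) enum

extern "C" {
    pub fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> cudaError_t;
}
```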

```diff
@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;

 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates the default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
```
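
To sanity-check the effect, one could time the first allocation after a synchronized warmup. A hypothetical micro-benchmark (not part of this PR), reusing `HostOrDeviceSlice::cuda_malloc` as it appears in the tests above:

```rust
use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
use icicle_cuda_runtime::stream::CudaStream;
use std::time::Instant;

fn main() {
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();
    // Block until the async malloc/free pair has primed the pool.
    stream.synchronize().unwrap();

    // With the context initialized and the default pool grown, the first
    // "real" allocation should no longer pay the one-time setup cost.
    let start = Instant::now();
    let _buf: HostOrDeviceSlice<u64> = HostOrDeviceSlice::cuda_malloc(1 << 20).unwrap();
    println!("first allocation after warmup took {:?}", start.elapsed());

    stream.destroy().unwrap();
}
```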

```diff
@@ -47,14 +47,18 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     pub fn as_mut_slice(&mut self) -> &mut [T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_mut_slice(),
         }
     }

     pub fn as_slice(&self) -> &[T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_slice(),
         }
     }
```