Warmup function

DmytroTym
2024-03-06 18:13:23 +02:00
parent b22aa02e91
commit 7185657ff7
4 changed files with 24 additions and 19 deletions

View File

@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@ where
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@ where
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();
             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@ where
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }

 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
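
Taken together, the hunks above restructure the batched MSM test: the CudaStream is now created and warmed up once before the test-size/batch-size loops, reused by every iteration, and destroyed only after all iterations finish, instead of being created and destroyed inside the inner loop. A rough Rust sketch of the resulting control flow, with the per-iteration MSM work elided and a placeholder function name (illustration only, not the test's actual code):

use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn sketch_of_restructured_test() {
    let test_sizes = [1000, 1 << 16];
    let batch_sizes = [1, 3, 1 << 4];

    // One stream for the whole run; warming it up front means the first
    // allocations and copies inside the loops are not artificially slow.
    let stream = CudaStream::create().unwrap();
    warmup(&stream).unwrap();

    for test_size in test_sizes {
        for batch_size in batch_sizes {
            // ... cuda_malloc device buffers, copy_from_host_async on `stream`,
            // run the MSMs, stream.synchronize().unwrap(), compare results ...
            let _ = (test_size, batch_size);
        }
    }

    // The stream outlives every iteration and is destroyed exactly once.
    stream.destroy().unwrap();
}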

View File

@@ -77,6 +77,7 @@ fn main() {
.allowlist_function("cudaMemset")
.allowlist_function("cudaMemsetAsync")
.allowlist_function("cudaDeviceGetDefaultMemPool")
.allowlist_function("cudaMemGetInfo")
.rustified_enum("cudaMemcpyKind")
// Stream Ordered Memory Allocator
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
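
For context, allowlist_function is what exposes a CUDA runtime symbol to the Rust side: bindgen emits an extern declaration for it in the generated bindings module, and the new device::warmup (next file) calls cudaMemGetInfo through exactly such a declaration, passing *mut usize pointers. The snippet below only illustrates that shape; the error type is a stand-in, since the real alias comes from the generated bindings:

// Illustration only: approximate shape of the FFI item bindgen generates for
// an allowlisted function. `cudaError_t` is a stand-in alias here; the real
// type is derived from cuda_runtime_api.h by the build script.
#[allow(non_camel_case_types)]
pub type cudaError_t = i32;

extern "C" {
    // C signature: cudaError_t cudaMemGetInfo(size_t* free, size_t* total);
    pub fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> cudaError_t;
}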

View File

@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;

 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates the default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down.
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
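
The mechanism, restated: warmup asks cudaMemGetInfo how much device memory is currently free, asynchronously allocates half of it (free_memory >> 1) on the caller's stream, and immediately frees it again, so the default memory pool is primed and the GPU is warmed up before any timed work runs. A minimal usage sketch, using only calls that appear in this diff:

use icicle_cuda_runtime::device::warmup;
use icicle_cuda_runtime::stream::CudaStream;

fn main() {
    // Create the stream the workload will run on.
    let stream = CudaStream::create().unwrap();

    // Warm up once, right after stream creation and before any timed work.
    warmup(&stream).unwrap();

    // ... enqueue copies / kernels / MSM calls on `stream` ...

    stream.synchronize().unwrap();
    stream.destroy().unwrap();
}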

View File

@@ -1,5 +1,5 @@
 use crate::bindings::{
-    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
@@ -118,18 +118,6 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
         }
     }

-    pub fn cuda_free_async(&mut self, stream: &CudaStream) -> CudaResult<()> {
-        if let Self::Device(s, device_id) = self {
-            check_device(*device_id);
-            if !s.is_empty() {
-                unsafe {
-                    cudaFreeAsync(s.as_mut_ptr() as *mut c_void, stream.handle as *mut _ as *mut _).wrap()?;
-                }
-            }
-        }
-        Ok(())
-    }
-
     pub fn copy_from_host(&mut self, val: &[T]) -> CudaResult<()> {
         match self {
             Self::Device(_, device_id) => check_device(*device_id),