From e19a869691af9379ddd21abccb4b37d03726f602 Mon Sep 17 00:00:00 2001
From: VitaliiH
Date: Mon, 10 Jun 2024 12:24:58 +0200
Subject: [PATCH] accumulate stwo (#535)

adds in-place vector addition and api as accumulate
---
 icicle/include/api/babybear.h                  |  3 ++
 icicle/include/api/bls12_377.h                 |  3 ++
 icicle/include/api/bls12_381.h                 |  3 ++
 icicle/include/api/bn254.h                     |  3 ++
 icicle/include/api/bw6_761.h                   |  3 ++
 icicle/include/api/grumpkin.h                  |  3 ++
 icicle/include/api/stark252.h                  |  3 ++
 icicle/include/api/templates/fields/vec_ops.h  |  3 ++
 .../api/templates/fields/vec_ops_ext.h         |  3 ++
 icicle/src/vec_ops/extern.cu                   | 12 +++++
 icicle/src/vec_ops/extern_extension.cu         | 11 +++++
 icicle/src/vec_ops/vec_ops.cu                  | 31 ++++++++-----
 icicle/tests/runner.cu                         |  3 +-
 wrappers/rust/icicle-core/src/vec_ops/mod.rs   | 45 ++++++++++++++++++-
 .../rust/icicle-core/src/vec_ops/tests.rs      | 12 ++++-
 15 files changed, 127 insertions(+), 14 deletions(-)

diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h
index 9b072691..8bb27f4b 100644
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -56,6 +56,9 @@ extern "C" cudaError_t babybear_mul_cuda(
 extern "C" cudaError_t babybear_add_cuda(
   babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

+extern "C" cudaError_t babybear_accumulate_cuda(
+  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t babybear_sub_cuda(
   babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h
index d1123c0c..7044dedb 100644
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -104,6 +104,9 @@ extern "C" cudaError_t bls12_377_mul_cuda(
 extern "C" cudaError_t bls12_377_add_cuda(
   bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

+extern "C" cudaError_t bls12_377_accumulate_cuda(
+  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t bls12_377_sub_cuda(
   bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h
index dcca3d17..71d3efa2 100644
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -104,6 +104,9 @@ extern "C" cudaError_t bls12_381_mul_cuda(
 extern "C" cudaError_t bls12_381_add_cuda(
   bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

+extern "C" cudaError_t bls12_381_accumulate_cuda(
+  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t bls12_381_sub_cuda(
   bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h
index 9b9e7bb0..14f4479a 100644
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -136,6 +136,9 @@ extern "C" cudaError_t bn254_mul_cuda(
 extern "C" cudaError_t bn254_add_cuda(
   bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

+extern "C" cudaError_t bn254_accumulate_cuda(
+  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t bn254_sub_cuda(
   bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h
index 78877da5..69d33f48 100644
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -104,6 +104,9 @@ extern "C" cudaError_t bw6_761_mul_cuda(
 extern "C" cudaError_t bw6_761_add_cuda(
   bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

+extern "C" cudaError_t bw6_761_accumulate_cuda(
+  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t bw6_761_sub_cuda(
   bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h
index c72be91a..09209dba 100644
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -74,6 +74,9 @@ extern "C" cudaError_t grumpkin_mul_cuda(
 extern "C" cudaError_t grumpkin_add_cuda(
   grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

+extern "C" cudaError_t grumpkin_accumulate_cuda(
+  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t grumpkin_sub_cuda(
   grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h
index c04702a1..f6248af7 100644
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -19,6 +19,9 @@ extern "C" cudaError_t stark252_mul_cuda(
 extern "C" cudaError_t stark252_add_cuda(
   stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

+extern "C" cudaError_t stark252_accumulate_cuda(
+  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t stark252_sub_cuda(
   stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

diff --git a/icicle/include/api/templates/fields/vec_ops.h b/icicle/include/api/templates/fields/vec_ops.h
index d740c8f7..8cfa4bde 100644
--- a/icicle/include/api/templates/fields/vec_ops.h
+++ b/icicle/include/api/templates/fields/vec_ops.h
@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_mul_cuda(
 extern "C" cudaError_t ${FIELD}_add_cuda(
   ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);

+extern "C" cudaError_t ${FIELD}_accumulate_cuda(
+  ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t ${FIELD}_sub_cuda(
   ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);

diff --git a/icicle/include/api/templates/fields/vec_ops_ext.h b/icicle/include/api/templates/fields/vec_ops_ext.h
index 5a6513f9..d2bc2bd3 100644
--- a/icicle/include/api/templates/fields/vec_ops_ext.h
+++ b/icicle/include/api/templates/fields/vec_ops_ext.h
@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_extension_mul_cuda(
 extern "C" cudaError_t ${FIELD}_extension_add_cuda(
   ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);

+extern "C" cudaError_t ${FIELD}_extension_accumulate_cuda(
+  ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
 extern "C" cudaError_t ${FIELD}_extension_sub_cuda(
   ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);

diff --git a/icicle/src/vec_ops/extern.cu b/icicle/src/vec_ops/extern.cu
index fc1d1ac6..78a7467c 100644
--- a/icicle/src/vec_ops/extern.cu
+++ b/icicle/src/vec_ops/extern.cu
@@ -30,6 +30,18 @@ namespace vec_ops {
     return add(vec_a, vec_b, n, config, result);
   }

+  /**
+   * Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
+   * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
+   * or standalone "STARK field" given by `-DFIELD`).
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t
+  CONCAT_EXPAND(FIELD, accumulate_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config)
+  {
+    return add(vec_a, vec_b, n, config, vec_a);
+  }
+
   /**
    * Extern version of [Sub](@ref Sub) function with the template parameter
    * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
diff --git a/icicle/src/vec_ops/extern_extension.cu b/icicle/src/vec_ops/extern_extension.cu
index 12927aec..80653c79 100644
--- a/icicle/src/vec_ops/extern_extension.cu
+++ b/icicle/src/vec_ops/extern_extension.cu
@@ -29,6 +29,17 @@ namespace vec_ops {
     return add(vec_a, vec_b, n, config, result);
   }

+  /**
+   * Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
+   * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t
+  CONCAT_EXPAND(FIELD, extension_accumulate_cuda)(extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config)
+  {
+    return add(vec_a, vec_b, n, config, vec_a);
+  }
+
   /**
    * Extern version of [Sub](@ref Sub) function with the template parameter
    * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
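Editor's note (not part of the patch): the *_accumulate_cuda entry points added above do not introduce a new kernel. Each one forwards to the existing add path with the output pointer aliased to vec_a, so the call computes vec_a[i] = vec_a[i] + vec_b[i] in place and leaves vec_b untouched; the vec_ops.cu changes below are what make that aliasing safe.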
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index f7bf7479..fef581ec 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -82,16 +82,19 @@ namespace vec_ops {
   } // namespace

   template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
-  cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+  cudaError_t vec_op(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
   {
     CHK_INIT_IF_RETURN();

+    bool is_in_place = vec_a == result;
+
     // Set the grid and block dimensions
     int num_threads = MAX_THREADS_PER_BLOCK;
     int num_blocks = (n + num_threads - 1) / num_threads;

     E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
-    const E *d_vec_a, *d_vec_b;
+    E* d_vec_a;
+    const E* d_vec_b;
     if (!config.is_a_on_device) {
       CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
       CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
@@ -109,41 +112,49 @@ namespace vec_ops {
     }

     if (!config.is_result_on_device) {
-      CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+      if (!is_in_place) {
+        CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+      } else {
+        d_result = d_vec_a;
+      }
     } else {
-      d_result = result;
+      if (!is_in_place) {
+        d_result = result;
+      } else {
+        d_result = result = d_vec_a;
+      }
     }

     // Call the kernel to perform element-wise operation
     Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);

-    if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
-    if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
-
     if (!config.is_result_on_device) {
       CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
     }

+    if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
+    if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
+
     if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));

     return CHK_LAST();
   }

   template <typename E>
-  cudaError_t mul(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+  cudaError_t mul(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
   {
     return vec_op(vec_a, vec_b, n, config, result);
   }

   template <typename E>
-  cudaError_t add(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+  cudaError_t add(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
   {
     return vec_op(vec_a, vec_b, n, config, result);
   }

   template <typename E>
-  cudaError_t sub(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+  cudaError_t sub(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
   {
     return vec_op(vec_a, vec_b, n, config, result);
   }
diff --git a/icicle/tests/runner.cu b/icicle/tests/runner.cu
index 6be2b2a5..330b82c5 100644
--- a/icicle/tests/runner.cu
+++ b/icicle/tests/runner.cu
@@ -3,12 +3,13 @@
 #include

 // include list of test files
-// Ensure the device_error_test.cu is last to prevent aborting mid-test run
 #include "field_test.cu"
 #ifdef CURVE_ID
 #include "curve_test.cu"
 #endif
 #include "error_handler_test.cu"
+
+// Ensure the device_error_test.cu is last to prevent aborting mid-test run
 #include "device_error_test.cu"

 int main(int argc, char** argv)
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 900f41f4..53e06d11 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -83,6 +83,12 @@ pub trait VecOps<F> {
         cfg: &VecOpsConfig,
     ) -> IcicleResult<()>;

+    fn accumulate(
+        a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        b: &(impl HostOrDeviceSlice<F> + ?Sized),
+        cfg: &VecOpsConfig,
+    ) -> IcicleResult<()>;
+
     fn sub(
         a: &(impl HostOrDeviceSlice<F> + ?Sized),
         b: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -207,6 +213,19 @@ where
     <<F as FieldImpl>::Config as VecOps<F>>::add(a, b, result, &cfg)
 }

+pub fn accumulate_scalars<F>(
+    a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    b: &(impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: VecOps<F>,
+{
+    let cfg = check_vec_ops_args(a, b, a, cfg);
+    <<F as FieldImpl>::Config as VecOps<F>>::accumulate(a, b, &cfg)
+}
+
 pub fn sub_scalars<F>(
     a: &(impl HostOrDeviceSlice<F> + ?Sized),
     b: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -299,6 +318,14 @@ macro_rules! impl_vec_ops_field {
                 result: *mut $field,
             ) -> CudaError;

+            #[link_name = concat!($field_prefix, "_accumulate_cuda")]
+            pub(crate) fn accumulate_scalars_cuda(
+                a: *mut $field,
+                b: *const $field,
+                size: u32,
+                cfg: *const VecOpsConfig,
+            ) -> CudaError;
+
             #[link_name = concat!($field_prefix, "_sub_cuda")]
             pub(crate) fn sub_scalars_cuda(
                 a: *const $field,
@@ -357,6 +384,22 @@ macro_rules! impl_vec_ops_field {
             }
         }

+        fn accumulate(
+            a: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+            b: &(impl HostOrDeviceSlice<$field> + ?Sized),
+            cfg: &VecOpsConfig,
+        ) -> IcicleResult<()> {
+            unsafe {
+                $field_prefix_ident::accumulate_scalars_cuda(
+                    a.as_mut_ptr(),
+                    b.as_ptr(),
+                    a.len() as u32,
+                    cfg as *const VecOpsConfig,
+                )
+                .wrap()
+            }
+        }
+
         fn sub(
             a: &(impl HostOrDeviceSlice<$field> + ?Sized),
             b: &(impl HostOrDeviceSlice<$field> + ?Sized),
@@ -457,7 +500,7 @@
     ) => {
         #[test]
         pub fn test_vec_add_scalars() {
-            check_vec_ops_scalars::<$field>()
+            check_vec_ops_scalars::<$field>();
         }

         #[test]
diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
index 1b6cd46d..8bb21a15 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs
@@ -5,19 +5,21 @@ use crate::vec_ops::{
 };
 use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};

+use super::accumulate_scalars;
+
 pub fn check_vec_ops_scalars<F: FieldImpl>()
 where
     <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
 {
     let test_size = 1 << 14;

-    let a = F::Config::generate_random(test_size);
+    let mut a = F::Config::generate_random(test_size);
     let b = F::Config::generate_random(test_size);
     let ones = vec![F::one(); test_size];
     let mut result = vec![F::zero(); test_size];
     let mut result2 = vec![F::zero(); test_size];
     let mut result3 = vec![F::zero(); test_size];
-    let a = HostSlice::from_slice(&a);
+    let a = HostSlice::from_mut_slice(&mut a);
     let b = HostSlice::from_slice(&b);
     let ones = HostSlice::from_slice(&ones);
     let result = HostSlice::from_mut_slice(&mut result);
@@ -34,6 +36,12 @@ where
     mul_scalars(a, ones, result3, &cfg).unwrap();

     assert_eq!(a[0], result3[0]);
+
+    add_scalars(a, b, result, &cfg).unwrap();
+
+    accumulate_scalars(a, b, &cfg).unwrap();
+
+    assert_eq!(a[0], result[0]);
 }

 pub fn check_bit_reverse<F: FieldImpl>()
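Design note on the in-place path (editor's addition): vec_op now detects aliasing with is_in_place = (vec_a == result). When the inputs start on the host, the device buffer staged for vec_a doubles as the result buffer, the result is copied back to the host before any buffers are released, and the separate free of the staged vec_a buffer is skipped in the in-place case so that buffer is released exactly once through the result path.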
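A minimal sketch of the new Rust API, modeled on the updated test above. It is not part of the patch: the concrete crate paths (icicle_bn254::curve::ScalarField, icicle_core::traits) and VecOpsConfig::default() are assumptions used only for illustration.

use icicle_bn254::curve::ScalarField;
use icicle_core::traits::{FieldImpl, GenerateRandom};
use icicle_core::vec_ops::{accumulate_scalars, VecOpsConfig};
use icicle_cuda_runtime::memory::HostSlice;

fn main() {
    let n = 1 << 10;

    // Random host-side inputs; any field wired up through impl_vec_ops_field! works the same way.
    let mut a = <ScalarField as FieldImpl>::Config::generate_random(n);
    let b = <ScalarField as FieldImpl>::Config::generate_random(n);

    // `a` must be a mutable slice: accumulate writes the sum back into it in place.
    let a = HostSlice::from_mut_slice(&mut a);
    let b = HostSlice::from_slice(&b);

    let cfg = VecOpsConfig::default(); // assumed default config: host-resident inputs, synchronous execution
    accumulate_scalars(a, b, &cfg).unwrap(); // a[i] <- a[i] + b[i] computed on the GPU
}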