accumulate stwo (#535)

Adds in-place vector addition and exposes it through the API as `accumulate`.
Author: VitaliiH
Date: 2024-06-10 12:24:58 +02:00 (committed by GitHub)
Parent: 9c55d888ae
Commit: e19a869691
15 changed files with 127 additions and 14 deletions
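The semantics being added are element-wise in-place addition: `a[i] += b[i]`, with the first operand doubling as the output buffer. A minimal CPU-side sketch of that contract (hypothetical helper; plain integers stand in for field elements):

```rust
/// Reference semantics of `accumulate`: a[i] += b[i], with `a` as the output.
fn accumulate_ref<T: Copy + std::ops::AddAssign>(a: &mut [T], b: &[T]) {
    assert_eq!(a.len(), b.len(), "operands must have equal length");
    for (x, y) in a.iter_mut().zip(b.iter()) {
        *x += *y;
    }
}

fn main() {
    let mut a = vec![1u64, 2, 3];
    let b = vec![10u64, 20, 30];
    accumulate_ref(&mut a, &b);
    assert_eq!(a, vec![11, 22, 33]); // result lands in `a`; no third buffer
}
```

Every entry point below is this one operation threaded through the existing `add` machinery.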

@@ -56,6 +56,9 @@ extern "C" cudaError_t babybear_mul_cuda(
 extern "C" cudaError_t babybear_add_cuda(
   babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
+extern "C" cudaError_t babybear_accumulate_cuda(
+  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t babybear_sub_cuda(
   babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

@@ -104,6 +104,9 @@ extern "C" cudaError_t bls12_377_mul_cuda(
 extern "C" cudaError_t bls12_377_add_cuda(
   bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
+extern "C" cudaError_t bls12_377_accumulate_cuda(
+  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t bls12_377_sub_cuda(
   bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

@@ -104,6 +104,9 @@ extern "C" cudaError_t bls12_381_mul_cuda(
 extern "C" cudaError_t bls12_381_add_cuda(
   bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
+extern "C" cudaError_t bls12_381_accumulate_cuda(
+  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t bls12_381_sub_cuda(
   bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

@@ -136,6 +136,9 @@ extern "C" cudaError_t bn254_mul_cuda(
 extern "C" cudaError_t bn254_add_cuda(
   bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
+extern "C" cudaError_t bn254_accumulate_cuda(
+  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t bn254_sub_cuda(
   bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

@@ -104,6 +104,9 @@ extern "C" cudaError_t bw6_761_mul_cuda(
 extern "C" cudaError_t bw6_761_add_cuda(
   bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
+extern "C" cudaError_t bw6_761_accumulate_cuda(
+  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t bw6_761_sub_cuda(
   bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

@@ -74,6 +74,9 @@ extern "C" cudaError_t grumpkin_mul_cuda(
 extern "C" cudaError_t grumpkin_add_cuda(
   grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
+extern "C" cudaError_t grumpkin_accumulate_cuda(
+  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t grumpkin_sub_cuda(
   grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

@@ -19,6 +19,9 @@ extern "C" cudaError_t stark252_mul_cuda(
 extern "C" cudaError_t stark252_add_cuda(
   stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
+extern "C" cudaError_t stark252_accumulate_cuda(
+  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t stark252_sub_cuda(
   stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_mul_cuda(
 extern "C" cudaError_t ${FIELD}_add_cuda(
   ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);
+extern "C" cudaError_t ${FIELD}_accumulate_cuda(
+  ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t ${FIELD}_sub_cuda(
   ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);

@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_extension_mul_cuda(
 extern "C" cudaError_t ${FIELD}_extension_add_cuda(
   ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);
+extern "C" cudaError_t ${FIELD}_extension_accumulate_cuda(
+  ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
 extern "C" cudaError_t ${FIELD}_extension_sub_cuda(
   ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);

@@ -30,6 +30,18 @@ namespace vec_ops {
   return add<scalar_t>(vec_a, vec_b, n, config, result);
 }
+/**
+ * Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
+ * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
+ * or standalone "STARK field" given by `-DFIELD`).
+ * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+ */
+extern "C" cudaError_t
+CONCAT_EXPAND(FIELD, accumulate_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config)
+{
+  return add<scalar_t>(vec_a, vec_b, n, config, vec_a);
+}
 /**
  * Extern version of [Sub](@ref Sub) function with the template parameter
  * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`

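Note that `accumulate_cuda` adds no new kernel: it forwards to the existing `add` with `vec_a` passed as the `result` pointer, which is why its C signature has no separate result parameter. A hedged CPU stand-in for that pointer shape (hypothetical function, not part of the diff):

```rust
/// Hypothetical CPU mirror of the `*_accumulate_cuda` pointer signature:
/// output is written through `vec_a`, so there is no `result` parameter.
unsafe fn accumulate_raw(vec_a: *mut u64, vec_b: *const u64, n: usize) {
    for i in 0..n {
        // the same aliasing that add(vec_a, vec_b, n, cfg, /*result=*/vec_a)
        // relies on: each element of vec_a is read, summed, written back
        *vec_a.add(i) += *vec_b.add(i);
    }
}

fn main() {
    let mut a = [5u64, 6, 7];
    let b = [1u64, 1, 1];
    unsafe { accumulate_raw(a.as_mut_ptr(), b.as_ptr(), a.len()) };
    assert_eq!(a, [6, 7, 8]);
}
```

Reusing `add` keeps the kernel count unchanged; the only new work is the plumbing that lets the output legally alias the first input (the `vec_op` changes below).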
@@ -29,6 +29,17 @@ namespace vec_ops {
   return add<extension_t>(vec_a, vec_b, n, config, result);
 }
+/**
+ * Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
+ * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
+ * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+ */
+extern "C" cudaError_t
+CONCAT_EXPAND(FIELD, extension_accumulate_cuda)(extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config)
+{
+  return add<extension_t>(vec_a, vec_b, n, config, vec_a);
+}
 /**
  * Extern version of [Sub](@ref Sub) function with the template parameter
  * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.

@@ -82,16 +82,19 @@ namespace vec_ops {
 } // namespace

 template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
-cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+cudaError_t vec_op(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
 {
   CHK_INIT_IF_RETURN();

+  bool is_in_place = vec_a == result;
+
   // Set the grid and block dimensions
   int num_threads = MAX_THREADS_PER_BLOCK;
   int num_blocks = (n + num_threads - 1) / num_threads;
   E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
-  const E *d_vec_a, *d_vec_b;
+  E* d_vec_a;
+  const E* d_vec_b;
   if (!config.is_a_on_device) {
     CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
     CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
@@ -109,41 +112,49 @@ namespace vec_ops {
   }

   if (!config.is_result_on_device) {
-    CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+    if (!is_in_place) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+    } else {
+      d_result = d_vec_a;
+    }
   } else {
-    d_result = result;
+    if (!is_in_place) {
+      d_result = result;
+    } else {
+      d_result = result = d_vec_a;
+    }
   }

   // Call the kernel to perform element-wise operation
   Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);

-  if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
-  if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
   if (!config.is_result_on_device) {
     CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
     CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
   }
+  if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
+  if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }

   if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));

   return CHK_LAST();
 }

 template <typename E>
-cudaError_t mul(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+cudaError_t mul(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
 {
   return vec_op<E, mul_kernel>(vec_a, vec_b, n, config, result);
 }

 template <typename E>
-cudaError_t add(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+cudaError_t add(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
 {
   return vec_op<E, add_kernel>(vec_a, vec_b, n, config, result);
 }

 template <typename E>
-cudaError_t sub(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
+cudaError_t sub(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
 {
   return vec_op<E, sub_kernel>(vec_a, vec_b, n, config, result);
 }

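The `is_in_place` path above is the core of the change: when `vec_a == result`, the staged device copy of `vec_a` doubles as the kernel's output, so no second result buffer is allocated, the device-to-host copy lands back in `vec_a`'s own storage, and the separate free of `d_alloc_vec_a` is skipped because the free of `d_result` would otherwise release the same allocation twice. A host-only mock of that bookkeeping (hypothetical; `Vec` clones stand in for cudaMallocAsync/cudaMemcpyAsync):

```rust
/// Host-only mock (hypothetical) of the in-place bookkeeping in `vec_op`,
/// modelling the `vec_a == result` case.
fn vec_op_mock(vec_a: &mut Vec<u64>, vec_b: &[u64]) {
    // "cudaMemcpyAsync H2D": stage a device copy of vec_a
    let mut d_vec_a: Vec<u64> = vec_a.clone();
    // in place: d_result aliases the staged copy of vec_a,
    // so no second "device" buffer is allocated
    let d_result: &mut Vec<u64> = &mut d_vec_a;
    // "kernel": element-wise add, writing into d_result (== d_vec_a)
    for (x, y) in d_result.iter_mut().zip(vec_b) {
        *x += *y;
    }
    // "cudaMemcpyAsync D2H": the result returns into vec_a's own storage;
    // only one staging buffer exists, so only one free would follow
    vec_a.copy_from_slice(&d_vec_a);
}

fn main() {
    let mut a = vec![1u64, 2, 3];
    vec_op_mock(&mut a, &[10, 20, 30]);
    assert_eq!(a, vec![11, 22, 33]); // a[i] += b[i], in place
}
```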
@@ -3,12 +3,13 @@
 #include <iostream>

 // include list of test files
-// Ensure the device_error_test.cu is last to prevent aborting mid-test run
 #include "field_test.cu"

 #ifdef CURVE_ID
 #include "curve_test.cu"
 #endif

 #include "error_handler_test.cu"
+// Ensure the device_error_test.cu is last to prevent aborting mid-test run
 #include "device_error_test.cu"

 int main(int argc, char** argv)

@@ -83,6 +83,12 @@ pub trait VecOps<F> {
         cfg: &VecOpsConfig,
     ) -> IcicleResult<()>;

+    fn accumulate(
+        a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        b: &(impl HostOrDeviceSlice<F> + ?Sized),
+        cfg: &VecOpsConfig,
+    ) -> IcicleResult<()>;
+
     fn sub(
         a: &(impl HostOrDeviceSlice<F> + ?Sized),
         b: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -207,6 +213,19 @@ where
     <<F as FieldImpl>::Config as VecOps<F>>::add(a, b, result, &cfg)
 }

+pub fn accumulate_scalars<F>(
+    a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    b: &(impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: VecOps<F>,
+{
+    let cfg = check_vec_ops_args(a, b, a, cfg);
+    <<F as FieldImpl>::Config as VecOps<F>>::accumulate(a, b, &cfg)
+}
+
 pub fn sub_scalars<F>(
     a: &(impl HostOrDeviceSlice<F> + ?Sized),
     b: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -299,6 +318,14 @@ macro_rules! impl_vec_ops_field {
                 result: *mut $field,
             ) -> CudaError;

+            #[link_name = concat!($field_prefix, "_accumulate_cuda")]
+            pub(crate) fn accumulate_scalars_cuda(
+                a: *mut $field,
+                b: *const $field,
+                size: u32,
+                cfg: *const VecOpsConfig,
+            ) -> CudaError;
+
             #[link_name = concat!($field_prefix, "_sub_cuda")]
             pub(crate) fn sub_scalars_cuda(
                 a: *const $field,
@@ -357,6 +384,22 @@ macro_rules! impl_vec_ops_field {
                 }
             }

+            fn accumulate(
+                a: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+                b: &(impl HostOrDeviceSlice<$field> + ?Sized),
+                cfg: &VecOpsConfig,
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::accumulate_scalars_cuda(
+                        a.as_mut_ptr(),
+                        b.as_ptr(),
+                        a.len() as u32,
+                        cfg as *const VecOpsConfig,
+                    )
+                    .wrap()
+                }
+            }
+
             fn sub(
                 a: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
@@ -457,7 +500,7 @@ macro_rules! impl_vec_add_tests {
     ) => {
         #[test]
         pub fn test_vec_add_scalars() {
-            check_vec_ops_scalars::<$field>()
+            check_vec_ops_scalars::<$field>();
         }

         #[test]

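For callers, the new wrapper reads like the existing `add_scalars` minus the output slice: `a` is borrowed mutably because it doubles as the output. A hedged usage sketch (crate paths and `VecOpsConfig::default()` are assumptions extrapolated from the test change below, not verified against a specific release):

```rust
use icicle_core::traits::{FieldImpl, GenerateRandom};
use icicle_core::vec_ops::{accumulate_scalars, VecOps, VecOpsConfig};
use icicle_cuda_runtime::memory::HostSlice;

/// Accumulates `b` into `a` on host slices: a[i] += b[i], in place.
fn accumulate_demo<F: FieldImpl>()
where
    <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
{
    let test_size = 1 << 10;
    let mut a = F::Config::generate_random(test_size);
    let b = F::Config::generate_random(test_size);
    // `a` needs a mutable slice wrapper, since it receives the result
    let a = HostSlice::from_mut_slice(&mut a);
    let b = HostSlice::from_slice(&b);
    let cfg = VecOpsConfig::default();
    accumulate_scalars(a, b, &cfg).unwrap();
}
```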
@@ -5,19 +5,21 @@ use crate::vec_ops::{
 };
 use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};

+use super::accumulate_scalars;
+
 pub fn check_vec_ops_scalars<F: FieldImpl>()
 where
     <F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
 {
     let test_size = 1 << 14;

-    let a = F::Config::generate_random(test_size);
+    let mut a = F::Config::generate_random(test_size);
     let b = F::Config::generate_random(test_size);
     let ones = vec![F::one(); test_size];
     let mut result = vec![F::zero(); test_size];
     let mut result2 = vec![F::zero(); test_size];
     let mut result3 = vec![F::zero(); test_size];
-    let a = HostSlice::from_slice(&a);
+    let a = HostSlice::from_mut_slice(&mut a);
     let b = HostSlice::from_slice(&b);
     let ones = HostSlice::from_slice(&ones);
     let result = HostSlice::from_mut_slice(&mut result);
@@ -34,6 +36,12 @@ where
     mul_scalars(a, ones, result3, &cfg).unwrap();
     assert_eq!(a[0], result3[0]);

     add_scalars(a, b, result, &cfg).unwrap();
+
+    accumulate_scalars(a, b, &cfg).unwrap();
+    assert_eq!(a[0], result[0]);
 }

 pub fn check_bit_reverse<F: FieldImpl>()
pub fn check_bit_reverse<F: FieldImpl>()