diff --git a/.github/workflows/cpp_cuda.yml b/.github/workflows/cpp_cuda.yml index 5fda8fc4..bb57823a 100644 --- a/.github/workflows/cpp_cuda.yml +++ b/.github/workflows/cpp_cuda.yml @@ -73,6 +73,8 @@ jobs: build_args: -DEXT_FIELD=ON - name: stark252 build_args: -DEXT_FIELD=OFF + - name: m31 + build_args: -DEXT_FIELD=ON steps: - name: Checkout Repo uses: actions/checkout@v4 diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index cc80fda9..09b3d206 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -62,8 +62,8 @@ jobs: # We need to limit the number of threads to avoid running out of memory on weaker machines # ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately run: | - cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 --ignored - cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 + cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2 --ignored + cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2 - name: Run baby bear tests working-directory: ./wrappers/rust/icicle-fields/icicle-babybear @@ -79,6 +79,13 @@ jobs: cargo test --release --verbose -- --ignored cargo test --release --verbose + - name: Run m31 tests + working-directory: ./wrappers/rust/icicle-fields/icicle-m31 + if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true' + run: | + cargo test --release --verbose -- --ignored + cargo test --release --verbose + # build-windows: # name: Build on Windows # runs-on: windows-2022 diff --git a/icicle/cmake/FieldsCommon.cmake b/icicle/cmake/FieldsCommon.cmake index 8b76f32b..e38f420c 100644 --- a/icicle/cmake/FieldsCommon.cmake +++ b/icicle/cmake/FieldsCommon.cmake @@ -1,5 +1,5 @@ function(check_field) - set(SUPPORTED_FIELDS babybear;stark252) + set(SUPPORTED_FIELDS babybear;stark252;m31) set(IS_FIELD_SUPPORTED FALSE) set(I 1000) diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h index 8bb27f4b..e152e4c5 100644 --- a/icicle/include/api/babybear.h +++ b/icicle/include/api/babybear.h @@ -19,6 +19,74 @@ extern "C" cudaError_t babybear_extension_ntt_cuda( const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, babybear::extension_t* output); +extern "C" cudaError_t babybear_initialize_domain( + babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); + +extern "C" cudaError_t babybear_ntt_cuda( + const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, babybear::scalar_t* output); + +extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx); + +extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); + +extern "C" cudaError_t babybear_scalar_convert_montgomery( + babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t babybear_extension_mul_cuda( + babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); + +extern "C" cudaError_t babybear_extension_add_cuda( + babybear::extension_t* vec_a, 
babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); + +extern "C" cudaError_t babybear_extension_accumulate_cuda( + babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t babybear_extension_sub_cuda( + babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); + +extern "C" cudaError_t babybear_extension_transpose_matrix_cuda( + const babybear::extension_t* input, + uint32_t row_size, + uint32_t column_size, + babybear::extension_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t babybear_extension_bit_reverse_cuda( + const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output); + + +extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); + +extern "C" cudaError_t babybear_extension_scalar_convert_montgomery( + babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t babybear_mul_cuda( + babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); + +extern "C" cudaError_t babybear_add_cuda( + babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); + +extern "C" cudaError_t babybear_accumulate_cuda( + babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t babybear_sub_cuda( + babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); + +extern "C" cudaError_t babybear_transpose_matrix_cuda( + const babybear::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + babybear::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t babybear_bit_reverse_cuda( + const babybear::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::scalar_t* output); + + extern "C" cudaError_t babybear_create_poseidon2_constants_cuda( int width, int alpha, @@ -50,67 +118,4 @@ extern "C" cudaError_t babybear_release_poseidon2_constants_cuda( poseidon2::Poseidon2Constants* constants, device_context::DeviceContext& ctx); -extern "C" cudaError_t babybear_mul_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); - -extern "C" cudaError_t babybear_add_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); - -extern "C" cudaError_t babybear_accumulate_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t babybear_sub_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); - -extern "C" cudaError_t babybear_transpose_matrix_cuda( - const babybear::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - babybear::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t babybear_bit_reverse_cuda( - const babybear::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - babybear::scalar_t* output); - -extern "C" void 
babybear_generate_scalars(babybear::scalar_t* scalars, int size); - -extern "C" cudaError_t babybear_scalar_convert_montgomery( - babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t babybear_initialize_domain( - babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t babybear_ntt_cuda( - const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, babybear::scalar_t* output); - -extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx); - -extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); - -extern "C" cudaError_t babybear_extension_scalar_convert_montgomery( - babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t babybear_extension_mul_cuda( - babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); - -extern "C" cudaError_t babybear_extension_add_cuda( - babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); - -extern "C" cudaError_t babybear_extension_sub_cuda( - babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); - -extern "C" cudaError_t babybear_extension_transpose_matrix_cuda( - const babybear::extension_t* input, - uint32_t row_size, - uint32_t column_size, - babybear::extension_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - #endif \ No newline at end of file diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index bf6f8d1a..f6c2a7c3 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -16,6 +16,20 @@ #include "poseidon/poseidon.cuh" #include "poseidon/tree/merkle.cuh" +extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); + +extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); + +extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); + +extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); + +extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery( + bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery( + bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda( bls12_377::g2_affine_t* bases, int msm_size, @@ -34,20 +48,6 @@ extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda( extern "C" cudaError_t bls12_377_msm_cuda( const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_377::projective_t* out); -extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); - -extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); - -extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); - -extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, 
int size); - -extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery( - bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery( - bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - extern "C" cudaError_t bls12_377_ecntt_cuda( const bls12_377::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_377::projective_t* output); @@ -65,6 +65,44 @@ extern "C" cudaError_t bls12_377_affine_convert_montgomery( extern "C" cudaError_t bls12_377_projective_convert_montgomery( bls12_377::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); +extern "C" cudaError_t bls12_377_initialize_domain( + bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); + +extern "C" cudaError_t bls12_377_ntt_cuda( + const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_377::scalar_t* output); + +extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx); + +extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); + +extern "C" cudaError_t bls12_377_scalar_convert_montgomery( + bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bls12_377_mul_cuda( + bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); + +extern "C" cudaError_t bls12_377_add_cuda( + bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); + +extern "C" cudaError_t bls12_377_accumulate_cuda( + bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t bls12_377_sub_cuda( + bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); + +extern "C" cudaError_t bls12_377_transpose_matrix_cuda( + const bls12_377::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + bls12_377::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t bls12_377_bit_reverse_cuda( + const bls12_377::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output); + + extern "C" cudaError_t bls12_377_create_optimized_poseidon_constants_cuda( int arity, int full_rounds_half, @@ -92,44 +130,4 @@ extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bls12_377_mul_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); - -extern "C" cudaError_t bls12_377_add_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); - -extern "C" cudaError_t bls12_377_accumulate_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bls12_377_sub_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); - -extern "C" cudaError_t bls12_377_transpose_matrix_cuda( - const bls12_377::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - 
bls12_377::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bls12_377_bit_reverse_cuda( - const bls12_377::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bls12_377::scalar_t* output); - -extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); - -extern "C" cudaError_t bls12_377_scalar_convert_montgomery( - bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bls12_377_initialize_domain( - bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bls12_377_ntt_cuda( - const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_377::scalar_t* output); - -extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index 8972349b..0a3e2904 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -16,6 +16,20 @@ #include "poseidon/poseidon.cuh" #include "poseidon/tree/merkle.cuh" +extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); + +extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); + +extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); + +extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); + +extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery( + bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery( + bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda( bls12_381::g2_affine_t* bases, int msm_size, @@ -34,20 +48,6 @@ extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda( extern "C" cudaError_t bls12_381_msm_cuda( const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_381::projective_t* out); -extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); - -extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); - -extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); - -extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); - -extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery( - bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery( - bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - extern "C" cudaError_t bls12_381_ecntt_cuda( const bls12_381::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_381::projective_t* output); @@ -65,6 +65,44 @@ extern "C" cudaError_t bls12_381_affine_convert_montgomery( extern "C" cudaError_t bls12_381_projective_convert_montgomery( bls12_381::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); +extern "C" cudaError_t 
bls12_381_initialize_domain( + bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); + +extern "C" cudaError_t bls12_381_ntt_cuda( + const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_381::scalar_t* output); + +extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx); + +extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); + +extern "C" cudaError_t bls12_381_scalar_convert_montgomery( + bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bls12_381_mul_cuda( + bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); + +extern "C" cudaError_t bls12_381_add_cuda( + bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); + +extern "C" cudaError_t bls12_381_accumulate_cuda( + bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t bls12_381_sub_cuda( + bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); + +extern "C" cudaError_t bls12_381_transpose_matrix_cuda( + const bls12_381::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + bls12_381::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t bls12_381_bit_reverse_cuda( + const bls12_381::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output); + + extern "C" cudaError_t bls12_381_create_optimized_poseidon_constants_cuda( int arity, int full_rounds_half, @@ -92,44 +130,4 @@ extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bls12_381_mul_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); - -extern "C" cudaError_t bls12_381_add_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); - -extern "C" cudaError_t bls12_381_accumulate_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bls12_381_sub_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); - -extern "C" cudaError_t bls12_381_transpose_matrix_cuda( - const bls12_381::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bls12_381::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bls12_381_bit_reverse_cuda( - const bls12_381::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bls12_381::scalar_t* output); - -extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); - -extern "C" cudaError_t bls12_381_scalar_convert_montgomery( - bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bls12_381_initialize_domain( - bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bls12_381_ntt_cuda( - const bls12_381::scalar_t* input, int size, 
ntt::NTTDir dir, ntt::NTTConfig& config, bls12_381::scalar_t* output); - -extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 20d8e6d1..c11b8a30 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -17,6 +17,20 @@ #include "poseidon/tree/merkle.cuh" #include "poseidon2/poseidon2.cuh" +extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2); + +extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out); + +extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size); + +extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size); + +extern "C" cudaError_t bn254_g2_affine_convert_montgomery( + bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bn254_g2_projective_convert_montgomery( + bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda( bn254::g2_affine_t* bases, int msm_size, @@ -35,20 +49,6 @@ extern "C" cudaError_t bn254_precompute_msm_bases_cuda( extern "C" cudaError_t bn254_msm_cuda( const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, msm::MSMConfig& config, bn254::projective_t* out); -extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2); - -extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out); - -extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size); - -extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size); - -extern "C" cudaError_t bn254_g2_affine_convert_montgomery( - bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bn254_g2_projective_convert_montgomery( - bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - extern "C" cudaError_t bn254_ecntt_cuda( const bn254::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bn254::projective_t* output); @@ -66,6 +66,44 @@ extern "C" cudaError_t bn254_affine_convert_montgomery( extern "C" cudaError_t bn254_projective_convert_montgomery( bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); +extern "C" cudaError_t bn254_initialize_domain( + bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); + +extern "C" cudaError_t bn254_ntt_cuda( + const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bn254::scalar_t* output); + +extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx); + +extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); + +extern "C" cudaError_t bn254_scalar_convert_montgomery( + bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bn254_mul_cuda( + bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); + +extern "C" cudaError_t bn254_add_cuda( + bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); + +extern "C" cudaError_t bn254_accumulate_cuda( + 
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t bn254_sub_cuda( + bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); + +extern "C" cudaError_t bn254_transpose_matrix_cuda( + const bn254::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + bn254::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t bn254_bit_reverse_cuda( + const bn254::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bn254::scalar_t* output); + + extern "C" cudaError_t bn254_create_poseidon2_constants_cuda( int width, int alpha, @@ -124,44 +162,4 @@ extern "C" cudaError_t bn254_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bn254_mul_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); - -extern "C" cudaError_t bn254_add_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); - -extern "C" cudaError_t bn254_accumulate_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bn254_sub_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); - -extern "C" cudaError_t bn254_transpose_matrix_cuda( - const bn254::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bn254::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bn254_bit_reverse_cuda( - const bn254::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bn254::scalar_t* output); - -extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); - -extern "C" cudaError_t bn254_scalar_convert_montgomery( - bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bn254_initialize_domain( - bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bn254_ntt_cuda( - const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bn254::scalar_t* output); - -extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index 8d072218..931d2d6c 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -16,6 +16,20 @@ #include "poseidon/poseidon.cuh" #include "poseidon/tree/merkle.cuh" +extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2); + +extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out); + +extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size); + +extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size); + +extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery( + bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery( + bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + extern "C" cudaError_t 
bw6_761_g2_precompute_msm_bases_cuda( bw6_761::g2_affine_t* bases, int msm_size, @@ -34,20 +48,6 @@ extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda( extern "C" cudaError_t bw6_761_msm_cuda( const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, msm::MSMConfig& config, bw6_761::projective_t* out); -extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2); - -extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out); - -extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size); - -extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size); - -extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery( - bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery( - bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - extern "C" cudaError_t bw6_761_ecntt_cuda( const bw6_761::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bw6_761::projective_t* output); @@ -65,6 +65,44 @@ extern "C" cudaError_t bw6_761_affine_convert_montgomery( extern "C" cudaError_t bw6_761_projective_convert_montgomery( bw6_761::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); +extern "C" cudaError_t bw6_761_initialize_domain( + bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); + +extern "C" cudaError_t bw6_761_ntt_cuda( + const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bw6_761::scalar_t* output); + +extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx); + +extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); + +extern "C" cudaError_t bw6_761_scalar_convert_montgomery( + bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t bw6_761_mul_cuda( + bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); + +extern "C" cudaError_t bw6_761_add_cuda( + bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); + +extern "C" cudaError_t bw6_761_accumulate_cuda( + bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t bw6_761_sub_cuda( + bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); + +extern "C" cudaError_t bw6_761_transpose_matrix_cuda( + const bw6_761::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + bw6_761::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t bw6_761_bit_reverse_cuda( + const bw6_761::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output); + + extern "C" cudaError_t bw6_761_create_optimized_poseidon_constants_cuda( int arity, int full_rounds_half, @@ -92,44 +130,4 @@ extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bw6_761_mul_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); - 
-extern "C" cudaError_t bw6_761_add_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); - -extern "C" cudaError_t bw6_761_accumulate_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bw6_761_sub_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); - -extern "C" cudaError_t bw6_761_transpose_matrix_cuda( - const bw6_761::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bw6_761::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bw6_761_bit_reverse_cuda( - const bw6_761::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bw6_761::scalar_t* output); - -extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); - -extern "C" cudaError_t bw6_761_scalar_convert_montgomery( - bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bw6_761_initialize_domain( - bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bw6_761_ntt_cuda( - const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bw6_761::scalar_t* output); - -extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index 9caaddb1..5690100c 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -38,6 +38,36 @@ extern "C" cudaError_t grumpkin_affine_convert_montgomery( extern "C" cudaError_t grumpkin_projective_convert_montgomery( grumpkin::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); +extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); + +extern "C" cudaError_t grumpkin_scalar_convert_montgomery( + grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t grumpkin_mul_cuda( + grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); + +extern "C" cudaError_t grumpkin_add_cuda( + grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); + +extern "C" cudaError_t grumpkin_accumulate_cuda( + grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t grumpkin_sub_cuda( + grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); + +extern "C" cudaError_t grumpkin_transpose_matrix_cuda( + const grumpkin::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + grumpkin::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t grumpkin_bit_reverse_cuda( + const grumpkin::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output); + + extern "C" cudaError_t grumpkin_create_optimized_poseidon_constants_cuda( int arity, int full_rounds_half, @@ -65,36 +95,4 @@ extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t 
grumpkin_mul_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); - -extern "C" cudaError_t grumpkin_add_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); - -extern "C" cudaError_t grumpkin_accumulate_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t grumpkin_sub_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); - -extern "C" cudaError_t grumpkin_transpose_matrix_cuda( - const grumpkin::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - grumpkin::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t grumpkin_bit_reverse_cuda( - const grumpkin::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - grumpkin::scalar_t* output); - -extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); - -extern "C" cudaError_t grumpkin_scalar_convert_montgomery( - grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/m31.h b/icicle/include/api/m31.h new file mode 100644 index 00000000..9105cf9e --- /dev/null +++ b/icicle/include/api/m31.h @@ -0,0 +1,75 @@ +// WARNING: This file is auto-generated by a script. +// Any changes made to this file may be overwritten. +// Please modify the code generation script instead. +// Path to the code generation script: scripts/gen_c_api.py + +#pragma once +#ifndef M31_API_H +#define M31_API_H + +#include +#include "gpu-utils/device_context.cuh" +#include "fields/stark_fields/m31.cuh" +#include "vec_ops/vec_ops.cuh" + +extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size); + +extern "C" cudaError_t m31_scalar_convert_montgomery( + m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t m31_extension_mul_cuda( + m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result); + +extern "C" cudaError_t m31_extension_add_cuda( + m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result); + +extern "C" cudaError_t m31_extension_accumulate_cuda( + m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t m31_extension_sub_cuda( + m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result); + +extern "C" cudaError_t m31_extension_transpose_matrix_cuda( + const m31::extension_t* input, + uint32_t row_size, + uint32_t column_size, + m31::extension_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t m31_extension_bit_reverse_cuda( + const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output); + + +extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size); + +extern "C" cudaError_t m31_extension_scalar_convert_montgomery( + m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + +extern "C" cudaError_t m31_mul_cuda( + m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* 
result); + +extern "C" cudaError_t m31_add_cuda( + m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result); + +extern "C" cudaError_t m31_accumulate_cuda( + m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + +extern "C" cudaError_t m31_sub_cuda( + m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result); + +extern "C" cudaError_t m31_transpose_matrix_cuda( + const m31::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + m31::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); + +extern "C" cudaError_t m31_bit_reverse_cuda( + const m31::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::scalar_t* output); + + +#endif \ No newline at end of file diff --git a/icicle/include/api/stark252.h b/icicle/include/api/stark252.h index f6248af7..b8bef75b 100644 --- a/icicle/include/api/stark252.h +++ b/icicle/include/api/stark252.h @@ -13,6 +13,19 @@ #include "ntt/ntt.cuh" #include "vec_ops/vec_ops.cuh" +extern "C" cudaError_t stark252_initialize_domain( + stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); + +extern "C" cudaError_t stark252_ntt_cuda( + const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, stark252::scalar_t* output); + +extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx); + +extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); + +extern "C" cudaError_t stark252_scalar_convert_montgomery( + stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); + extern "C" cudaError_t stark252_mul_cuda( stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result); @@ -35,22 +48,7 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda( bool is_async); extern "C" cudaError_t stark252_bit_reverse_cuda( - const stark252::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - stark252::scalar_t* output); + const stark252::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, stark252::scalar_t* output); -extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); - -extern "C" cudaError_t stark252_scalar_convert_montgomery( - stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t stark252_initialize_domain( - stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t stark252_ntt_cuda( - const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, stark252::scalar_t* output); - -extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx); #endif \ No newline at end of file diff --git a/icicle/include/api/templates/fields/vec_ops.h b/icicle/include/api/templates/fields/vec_ops.h index 8cfa4bde..7fe1c9be 100644 --- a/icicle/include/api/templates/fields/vec_ops.h +++ b/icicle/include/api/templates/fields/vec_ops.h @@ -17,4 +17,7 @@ extern "C" cudaError_t ${FIELD}_transpose_matrix_cuda( ${FIELD}::scalar_t* output, device_context::DeviceContext& ctx, bool on_device, - bool is_async); \ No newline at end of file + bool is_async); + +extern "C" cudaError_t ${FIELD}_bit_reverse_cuda( + const ${FIELD}::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, 
${FIELD}::scalar_t* output); diff --git a/icicle/include/api/templates/fields/vec_ops_ext.h b/icicle/include/api/templates/fields/vec_ops_ext.h index d2bc2bd3..0266e9cc 100644 --- a/icicle/include/api/templates/fields/vec_ops_ext.h +++ b/icicle/include/api/templates/fields/vec_ops_ext.h @@ -17,4 +17,7 @@ extern "C" cudaError_t ${FIELD}_extension_transpose_matrix_cuda( ${FIELD}::extension_t* output, device_context::DeviceContext& ctx, bool on_device, - bool is_async); \ No newline at end of file + bool is_async); + +extern "C" cudaError_t ${FIELD}_extension_bit_reverse_cuda( + const ${FIELD}::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, ${FIELD}::extension_t* output); diff --git a/icicle/include/curves/macro.h b/icicle/include/curves/macro.h index 9f25ed28..6ce3cb66 100644 --- a/icicle/include/curves/macro.h +++ b/icicle/include/curves/macro.h @@ -22,7 +22,7 @@ typedef Affine affine_t; #define G2_CURVE_DEFINITIONS \ - typedef ExtensionField g2_point_field_t; \ + typedef ExtensionField g2_point_field_t; \ static constexpr g2_point_field_t g2_generator_x = \ g2_point_field_t{point_field_t{g2_gen_x_re}, point_field_t{g2_gen_x_im}}; \ static constexpr g2_point_field_t g2_generator_y = \ diff --git a/icicle/include/fields/field_config.cuh b/icicle/include/fields/field_config.cuh index 51729b64..d9ec18c0 100644 --- a/icicle/include/fields/field_config.cuh +++ b/icicle/include/fields/field_config.cuh @@ -33,6 +33,9 @@ namespace field_config = babybear; #elif FIELD_ID == STARK_252 #include "fields/stark_fields/stark252.cuh" namespace field_config = stark252; +#elif FIELD_ID == M31 +#include "fields/stark_fields/m31.cuh" +namespace field_config = m31; #endif #endif \ No newline at end of file diff --git a/icicle/include/fields/id.h b/icicle/include/fields/id.h index 691858ce..4c017c4d 100644 --- a/icicle/include/fields/id.h +++ b/icicle/include/fields/id.h @@ -10,5 +10,6 @@ #define BABY_BEAR 1001 #define STARK_252 1002 +#define M31 1003 #endif \ No newline at end of file diff --git a/icicle/include/fields/quadratic_extension.cuh b/icicle/include/fields/quadratic_extension.cuh index 36249a39..10065386 100644 --- a/icicle/include/fields/quadratic_extension.cuh +++ b/icicle/include/fields/quadratic_extension.cuh @@ -4,13 +4,13 @@ #include "gpu-utils/modifiers.cuh" #include "gpu-utils/sharedmem.cuh" -template +template class ExtensionField { private: - friend Field; + friend T; - typedef typename Field::Wide FWide; + typedef typename T::Wide FWide; struct ExtensionWide { FWide real; @@ -28,7 +28,7 @@ private: }; public: - typedef Field FF; + typedef T FF; static constexpr unsigned TLC = 2 * CONFIG::limbs_count; FF real; @@ -196,11 +196,11 @@ public: } }; -template -struct SharedMemory> { - __device__ ExtensionField* getPointer() +template +struct SharedMemory> { + __device__ ExtensionField* getPointer() { - extern __shared__ ExtensionField s_ext2_scalar_[]; + extern __shared__ ExtensionField s_ext2_scalar_[]; return s_ext2_scalar_; } }; \ No newline at end of file diff --git a/icicle/include/fields/quartic_extension.cuh b/icicle/include/fields/quartic_extension.cuh index 1f73adc8..8fead58c 100644 --- a/icicle/include/fields/quartic_extension.cuh +++ b/icicle/include/fields/quartic_extension.cuh @@ -4,11 +4,11 @@ #include "gpu-utils/modifiers.cuh" #include "gpu-utils/sharedmem.cuh" -template +template class ExtensionField { private: - typedef typename Field::Wide FWide; + typedef typename T::Wide FWide; struct ExtensionWide { FWide real; @@ -28,7 +28,7 @@ private: }; public: - 
typedef Field FF; + typedef T FF; static constexpr unsigned TLC = 4 * CONFIG::limbs_count; FF real; @@ -49,15 +49,14 @@ public: static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs) { return ExtensionField{ - xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r}, - xs.im3 * FF{CONFIG::montgomery_r}}; + FF::to_montgomery(xs.real), FF::to_montgomery(xs.im1), FF::to_montgomery(xs.im2), FF::to_montgomery(xs.im3)}; } static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs) { return ExtensionField{ - xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv}, - xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}}; + FF::from_montgomery(xs.real), FF::from_montgomery(xs.im1), FF::from_montgomery(xs.im2), + FF::from_montgomery(xs.im3)}; } static HOST_INLINE ExtensionField rand_host() @@ -247,11 +246,11 @@ public: } }; -template -struct SharedMemory> { - __device__ ExtensionField* getPointer() +template +struct SharedMemory> { + __device__ ExtensionField* getPointer() { - extern __shared__ ExtensionField s_ext4_scalar_[]; + extern __shared__ ExtensionField s_ext4_scalar_[]; return s_ext4_scalar_; } }; \ No newline at end of file diff --git a/icicle/include/fields/stark_fields/babybear.cuh b/icicle/include/fields/stark_fields/babybear.cuh index ad3db0e2..6463893f 100644 --- a/icicle/include/fields/stark_fields/babybear.cuh +++ b/icicle/include/fields/stark_fields/babybear.cuh @@ -58,5 +58,5 @@ namespace babybear { /** * Extension field of `scalar_t` enabled if `-DEXT_FIELD` env variable is. */ - typedef ExtensionField extension_t; + typedef ExtensionField extension_t; } // namespace babybear diff --git a/icicle/include/fields/stark_fields/m31.cuh b/icicle/include/fields/stark_fields/m31.cuh new file mode 100644 index 00000000..03f74fd9 --- /dev/null +++ b/icicle/include/fields/stark_fields/m31.cuh @@ -0,0 +1,224 @@ +#pragma once + +#include "fields/storage.cuh" +#include "fields/field.cuh" +#include "fields/quartic_extension.cuh" + +namespace m31 { + template + class MersenneField : public Field + { + public: + HOST_DEVICE_INLINE MersenneField(const MersenneField& other) : Field(other) {} + HOST_DEVICE_INLINE MersenneField(const uint32_t& x = 0) : Field({x}) {} + HOST_DEVICE_INLINE MersenneField(storage x) : Field{x} {} + HOST_DEVICE_INLINE MersenneField(const Field& other) : Field(other) {} + + static constexpr HOST_DEVICE_INLINE MersenneField zero() { return MersenneField(CONFIG::zero.limbs[0]); } + + static constexpr HOST_DEVICE_INLINE MersenneField one() { return MersenneField(CONFIG::one.limbs[0]); } + + static constexpr HOST_DEVICE_INLINE MersenneField from(uint32_t value) { return MersenneField(value); } + + static HOST_INLINE MersenneField rand_host() { return MersenneField(Field::rand_host()); } + + static void rand_host_many(MersenneField* out, int size) + { + for (int i = 0; i < size; i++) + out[i] = rand_host(); + } + + HOST_DEVICE_INLINE MersenneField& operator=(const Field& other) + { + if (this != &other) { Field::operator=(other); } + return *this; + } + + HOST_DEVICE_INLINE uint32_t get_limb() const { return this->limbs_storage.limbs[0]; } + + // The `Wide` struct represents a redundant 32-bit form of the Mersenne Field. 
+ struct Wide { + uint32_t storage; + static constexpr HOST_DEVICE_INLINE Wide from_field(const MersenneField& xs) + { + Wide out{}; + out.storage = xs.get_limb(); + return out; + } + static constexpr HOST_DEVICE_INLINE Wide from_number(const uint32_t& xs) + { + Wide out{}; + out.storage = xs; + return out; + } + friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) + { + uint64_t tmp = (uint64_t)xs.storage + ys.storage; // max: 2^33 - 2 = 2^32(1) + (2^32 - 2) + tmp = ((tmp >> 32) << 1) + (uint32_t)(tmp); // 2(1)+(2^32-2) = 2^32(1)+(0) + return from_number((uint32_t)((tmp >> 32) << 1) + (uint32_t)(tmp)); // max: 2(1) + 0 = 2 + } + friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) + { + uint64_t tmp = CONFIG::modulus_3 + xs.storage - + ys.storage; // max: 3(2^31-1) + 2^32-1 - 0 = 2^33 + 2^31-4 = 2^32(2) + (2^31-4) + return from_number(((uint32_t)(tmp >> 32) << 1) + (uint32_t)(tmp)); // max: 2(2)+(2^31-4) = 2^31 + } + template + static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs) + { + uint64_t tmp = CONFIG::modulus_3 - xs.storage; // max: 3(2^31-1) - 0 = 2^32(1) + (2^31 - 3) + return from_number(((uint32_t)(tmp >> 32) << 1) + (uint32_t)(tmp)); // max: 2(1)+(2^31-3) = 2^31 - 1 + } + friend HOST_DEVICE_INLINE Wide operator*(Wide xs, const Wide& ys) + { + uint64_t t1 = (uint64_t)xs.storage * ys.storage; // max: 2^64 - 2^33+1 = 2^32(2^32 - 2) + 1 + t1 = ((t1 >> 32) << 1) + (uint32_t)(t1); // max: 2(2^32 - 2) + 1 = 2^32(1) + (2^32 - 3) + return from_number((((uint32_t)(t1 >> 32)) << 1) + (uint32_t)(t1)); // max: 2(1) - (2^32 - 3) = 2^32 - 1 + } + }; + + static constexpr HOST_DEVICE_INLINE MersenneField div2(const MersenneField& xs, const uint32_t& power = 1) + { + uint32_t t = xs.get_limb(); + return MersenneField{{((t >> power) | (t << (31 - power))) & MersenneField::get_modulus().limbs[0]}}; + } + + static constexpr HOST_DEVICE_INLINE MersenneField neg(const MersenneField& xs) + { + uint32_t t = xs.get_limb(); + return MersenneField{{t == 0 ? t : MersenneField::get_modulus().limbs[0] - t}}; + } + + template + static constexpr HOST_DEVICE_INLINE MersenneField reduce(Wide xs) + { + const uint32_t modulus = MersenneField::get_modulus().limbs[0]; + uint32_t tmp = (xs.storage >> 31) + (xs.storage & modulus); // max: 1 + 2^31-1 = 2^31 + tmp = (tmp >> 31) + (tmp & modulus); // max: 1 + 0 = 1 + return MersenneField{{tmp == modulus ?
0 : tmp}}; + } + + static constexpr HOST_DEVICE_INLINE MersenneField inverse(const MersenneField& x) + { + uint32_t xs = x.limbs_storage.limbs[0]; + if (xs <= 1) return xs; + uint32_t a = 1, b = 0, y = xs, z = MersenneField::get_modulus().limbs[0], e, m = z; + while (1) { +#ifdef __CUDA_ARCH__ + e = __ffs(y) - 1; +#else + e = __builtin_ctz(y); +#endif + y >>= e; + if (a >= m) { + a = (a & m) + (a >> 31); + if (a == m) a = 0; + } + a = ((a >> e) | (a << (31 - e))) & m; + if (y == 1) return a; + e = a + b; + b = a; + a = e; + e = y + z; + z = y; + y = e; + } + } + + friend HOST_DEVICE_INLINE MersenneField operator+(MersenneField xs, const MersenneField& ys) + { + uint32_t m = MersenneField::get_modulus().limbs[0]; + uint32_t t = xs.get_limb() + ys.get_limb(); + if (t > m) t = (t & m) + (t >> 31); + if (t == m) t = 0; + return MersenneField{{t}}; + } + + friend HOST_DEVICE_INLINE MersenneField operator-(MersenneField xs, const MersenneField& ys) + { + return xs + neg(ys); + } + + friend HOST_DEVICE_INLINE MersenneField operator*(MersenneField xs, const MersenneField& ys) + { + uint64_t x = (uint64_t)(xs.get_limb()) * ys.get_limb(); + uint32_t t = ((x >> 31) + (x & MersenneField::get_modulus().limbs[0])); + uint32_t m = MersenneField::get_modulus().limbs[0]; + if (t > m) t = (t & m) + (t >> 31); + if (t > m) t = (t & m) + (t >> 31); + if (t == m) t = 0; + return MersenneField{{t}}; + } + + static constexpr HOST_DEVICE_INLINE Wide mul_wide(const MersenneField& xs, const MersenneField& ys) + { + return Wide::from_field(xs) * Wide::from_field(ys); + } + + template + static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const MersenneField& xs) + { + return mul_wide(xs, xs); + } + + static constexpr HOST_DEVICE_INLINE MersenneField sqr(const MersenneField& xs) { return xs * xs; } + + static constexpr HOST_DEVICE_INLINE MersenneField to_montgomery(const MersenneField& xs) { return xs; } + + static constexpr HOST_DEVICE_INLINE MersenneField from_montgomery(const MersenneField& xs) { return xs; } + + static constexpr HOST_DEVICE_INLINE MersenneField pow(MersenneField base, int exp) + { + MersenneField res = one(); + while (exp > 0) { + if (exp & 1) res = res * base; + base = base * base; + exp >>= 1; + } + return res; + } + }; + struct fp_config { + static constexpr unsigned limbs_count = 1; + static constexpr unsigned omegas_count = 1; + static constexpr unsigned modulus_bit_count = 31; + static constexpr unsigned num_of_reductions = 1; + + static constexpr storage modulus = {0x7fffffff}; + static constexpr storage modulus_2 = {0xfffffffe}; + static constexpr uint64_t modulus_3 = 0x17ffffffd; + static constexpr storage modulus_4 = {0xfffffffc}; + static constexpr storage neg_modulus = {0x87ffffff}; + static constexpr storage<2 * limbs_count> modulus_wide = {0x7fffffff, 0x00000000}; + static constexpr storage<2 * limbs_count> modulus_squared = {0x00000001, 0x3fffffff}; + static constexpr storage<2 * limbs_count> modulus_squared_2 = {0x00000002, 0x7ffffffe}; + static constexpr storage<2 * limbs_count> modulus_squared_4 = {0x00000004, 0xfffffffc}; + + static constexpr storage m = {0x80000001}; + static constexpr storage one = {0x00000001}; + static constexpr storage zero = {0x00000000}; + static constexpr storage montgomery_r = {0x00000001}; + static constexpr storage montgomery_r_inv = {0x00000001}; + + static constexpr storage_array omega = {{{0x7ffffffe}}}; + + static constexpr storage_array omega_inv = {{{0x7ffffffe}}}; + + static constexpr storage_array inv = {{{0x40000000}}}; + + // nonresidue to 
generate the extension field + static constexpr uint32_t nonresidue = 11; + // true if nonresidue is negative. + static constexpr bool nonresidue_is_negative = false; + }; + + /** + * Scalar field. Is always a prime field. + */ + typedef MersenneField scalar_t; + + /** + * Extension field of `scalar_t` enabled if `-DEXT_FIELD` env variable is. + */ + typedef ExtensionField extension_t; +} // namespace m31 diff --git a/icicle/src/fields/CMakeLists.txt b/icicle/src/fields/CMakeLists.txt index cb16de1d..82b5e15f 100644 --- a/icicle/src/fields/CMakeLists.txt +++ b/icicle/src/fields/CMakeLists.txt @@ -2,8 +2,8 @@ if (EXT_FIELD) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD") endif () -SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin) -SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252) +SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31) +SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31) set(TARGET icicle_field) @@ -13,7 +13,9 @@ set(FIELD_SOURCE ${SRC}/fields/extern.cu) list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu) if(EXT_FIELD) list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu) - list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu) + if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT) + list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu) + endif() list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu) endif() diff --git a/icicle/tests/CMakeLists.txt b/icicle/tests/CMakeLists.txt index 212e5852..e904710a 100644 --- a/icicle/tests/CMakeLists.txt +++ b/icicle/tests/CMakeLists.txt @@ -16,7 +16,7 @@ target_link_libraries(runner GTest::gtest_main) set_target_properties(runner PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # polynomial test-bench -set(POLY_UNSUPPORTED_FIELD grumpkin) +set(POLY_UNSUPPORTED_FIELD grumpkin;m31) if (NOT FIELD IN_LIST POLY_UNSUPPORTED_FIELD) add_executable(polynomial_tb polynomial_test.cu) target_link_libraries(polynomial_tb GTest::gtest_main icicle_field pthread) diff --git a/scripts/gen_c_api.py b/scripts/gen_c_api.py index 1457a6b6..26817a44 100755 --- a/scripts/gen_c_api.py +++ b/scripts/gen_c_api.py @@ -64,6 +64,12 @@ FIELDS_CONFIG = { "field_ext.h", "vec_ops_ext.h", "ntt_ext.h", + }, + "m31": { + "ntt_ext.h", + "ntt.h", + "poseidon.h", + "poseidon2.h", } } diff --git a/scripts/hooks/pre-push b/scripts/hooks/pre-push index 2a639298..cec55f9b 100755 --- a/scripts/hooks/pre-push +++ b/scripts/hooks/pre-push @@ -23,6 +23,8 @@ then echo "find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -style=file" echo "" status=1 +else + echo "🟩 Icicle Core format is fine" fi # Run go fmt across all Golang packages @@ -32,6 +34,8 @@ then echo "Please commit the formatted files" echo "" status=1 +else + echo "🟩 Golang files format is fine" fi # Run cargo fmt on Rust files @@ -42,6 +46,8 @@ then echo "Please go to wrappers/rust and format the Rust files using the following command:" echo "find . 
-path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --" status=1 +else + echo "🟩 Rust files format is fine" fi exit $status \ No newline at end of file diff --git a/wrappers/rust/Cargo.toml b/wrappers/rust/Cargo.toml index ee272f46..ba04c83b 100644 --- a/wrappers/rust/Cargo.toml +++ b/wrappers/rust/Cargo.toml @@ -9,6 +9,7 @@ members = [ "icicle-curves/icicle-bn254", "icicle-curves/icicle-grumpkin", "icicle-fields/icicle-babybear", + "icicle-fields/icicle-m31", "icicle-fields/icicle-stark252", "icicle-hash", ] diff --git a/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml b/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml new file mode 100644 index 00000000..51b5d68f --- /dev/null +++ b/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "icicle-m31" +version.workspace = true +edition.workspace = true +authors.workspace = true +description = "Rust wrapper for the CUDA implementation of m31 prime field by Ingonyama" +homepage.workspace = true +repository.workspace = true + +[dependencies] +icicle-core = { workspace = true } +icicle-cuda-runtime = { workspace = true } + +[build-dependencies] +cmake = "0.1.50" + +[features] +default = [] +devmode = ["icicle-core/devmode"] diff --git a/wrappers/rust/icicle-fields/icicle-m31/build.rs b/wrappers/rust/icicle-fields/icicle-m31/build.rs new file mode 100644 index 00000000..5663eaf0 --- /dev/null +++ b/wrappers/rust/icicle-fields/icicle-m31/build.rs @@ -0,0 +1,29 @@ +use cmake::Config; +use std::env; + +fn main() { + println!("cargo:rerun-if-env-changed=CXXFLAGS"); + println!("cargo:rerun-if-changed=../../../../icicle"); + + // Base config + let mut config = Config::new("../../../../icicle/"); + config + .define("FIELD", "m31") + .define("CMAKE_BUILD_TYPE", "Release") + .define("EXT_FIELD", "ON"); + + if let Ok(cuda_arch) = env::var("CUDA_ARCH") { + config.define("CUDA_ARCH", &cuda_arch); + } + + // Build + let out_dir = config + .build_target("icicle_field") + .build(); + + println!("cargo:rustc-link-search={}/build/lib", out_dir.display()); + + println!("cargo:rustc-link-lib=ingo_field_m31"); + println!("cargo:rustc-link-lib=stdc++"); + println!("cargo:rustc-link-lib=cudart"); +} diff --git a/wrappers/rust/icicle-fields/icicle-m31/src/field.rs b/wrappers/rust/icicle-fields/icicle-m31/src/field.rs new file mode 100644 index 00000000..068515ad --- /dev/null +++ b/wrappers/rust/icicle-fields/icicle-m31/src/field.rs @@ -0,0 +1,33 @@ +use icicle_core::field::{Field, MontgomeryConvertibleField}; +use icicle_core::traits::{FieldConfig, FieldImpl, GenerateRandom}; +use icicle_core::{impl_field, impl_scalar_field}; +use icicle_cuda_runtime::device::check_device; +use icicle_cuda_runtime::device_context::DeviceContext; +use icicle_cuda_runtime::error::CudaError; +use icicle_cuda_runtime::memory::{DeviceSlice, HostOrDeviceSlice}; + +pub(crate) const SCALAR_LIMBS: usize = 1; +pub(crate) const EXTENSION_LIMBS: usize = 4; + +impl_scalar_field!("m31", m31, SCALAR_LIMBS, ScalarField, ScalarCfg, Fr); +impl_scalar_field!( + "m31_extension", + m31_extension, + EXTENSION_LIMBS, + ExtensionField, + ExtensionCfg, + Fr +); +#[cfg(test)] +mod tests { + use super::{ExtensionField, ScalarField}; + use icicle_core::impl_field_tests; + use icicle_core::tests::*; + + impl_field_tests!(ScalarField); + mod extension { + use super::*; + + impl_field_tests!(ExtensionField); + } +} diff --git a/wrappers/rust/icicle-fields/icicle-m31/src/lib.rs 
b/wrappers/rust/icicle-fields/icicle-m31/src/lib.rs new file mode 100644 index 00000000..001f51ba --- /dev/null +++ b/wrappers/rust/icicle-fields/icicle-m31/src/lib.rs @@ -0,0 +1,2 @@ +pub mod field; +pub mod vec_ops; diff --git a/wrappers/rust/icicle-fields/icicle-m31/src/vec_ops/mod.rs b/wrappers/rust/icicle-fields/icicle-m31/src/vec_ops/mod.rs new file mode 100644 index 00000000..ecf6b5b0 --- /dev/null +++ b/wrappers/rust/icicle-fields/icicle-m31/src/vec_ops/mod.rs @@ -0,0 +1,26 @@ +use crate::field::{ExtensionCfg, ExtensionField, ScalarCfg, ScalarField}; + +use icicle_core::error::IcicleResult; +use icicle_core::impl_vec_ops_field; +use icicle_core::traits::IcicleResultWrap; +use icicle_core::vec_ops::{BitReverseConfig, VecOps, VecOpsConfig}; +use icicle_cuda_runtime::device_context::DeviceContext; +use icicle_cuda_runtime::error::CudaError; +use icicle_cuda_runtime::memory::HostOrDeviceSlice; + +impl_vec_ops_field!("m31", m31, ScalarField, ScalarCfg); +impl_vec_ops_field!("m31_extension", m31_extension, ExtensionField, ExtensionCfg); + +#[cfg(test)] +pub(crate) mod tests { + use crate::field::{ExtensionField, ScalarField}; + use icicle_core::impl_vec_add_tests; + use icicle_core::vec_ops::tests::*; + + impl_vec_add_tests!(ScalarField); + mod extension { + use super::*; + + impl_vec_add_tests!(ExtensionField); + } +}
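Usage note (not part of the patch): the new icicle-m31 crate wires the Mersenne-31 scalar and quartic-extension fields into the same icicle-core traits used by the other field crates, so element-wise vector ops are invoked the same way. The sketch below is a minimal example under that assumption; add_scalars, VecOpsConfig, GenerateRandom, FieldImpl and HostSlice come from icicle-core / icicle-cuda-runtime rather than from this diff, so treat the exact names and signatures as assumptions.

// Hypothetical end-to-end use of the new crate: element-wise addition over m31 on the GPU.
use icicle_core::traits::{FieldImpl, GenerateRandom};
use icicle_core::vec_ops::{add_scalars, VecOpsConfig};
use icicle_cuda_runtime::memory::HostSlice;
use icicle_m31::field::{ScalarCfg, ScalarField};

fn main() {
    let size = 1 << 12;
    // Random host-side inputs generated through the field's config type.
    let a: Vec<ScalarField> = ScalarCfg::generate_random(size);
    let b: Vec<ScalarField> = ScalarCfg::generate_random(size);
    let mut result = vec![ScalarField::zero(); size];

    // Default config targets the default device/stream; the slices here live on the host.
    let cfg = VecOpsConfig::default();
    add_scalars(
        HostSlice::from_slice(&a),
        HostSlice::from_slice(&b),
        HostSlice::from_mut_slice(&mut result),
        &cfg,
    )
    .expect("m31 vector addition failed");
}

The quartic extension field registered by impl_vec_ops_field!("m31_extension", ...) can be exercised the same way with ExtensionField; building with EXT_FIELD=ON, as the new cpp_cuda.yml matrix entry and the crate's build.rs do, is what compiles those kernels.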