diff --git a/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp b/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp index a42294e4..66b68aa2 100644 --- a/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp +++ b/icicle_v3/backend/cpu/src/curve/cpu_msm.cpp @@ -6,14 +6,124 @@ #include "icicle/curves/projective.h" #include "icicle/curves/curve_config.h" +// #define NDEBUG +#include // TODO remove + using namespace curve_config; using namespace icicle; +// TODO move to test file and add relevant ifdef +class Dummy_Scalar +{ +public: + static constexpr unsigned NBITS = 32; + + unsigned x; + unsigned p = 10; + // unsigned p = 1<<30; + + static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; } + + static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; } + + friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) + { + os << scalar.x; + return os; + } + + HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const + { + return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1); + } + + friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) + { + return {(p1.x + p2.x) % p1.p}; + } + + friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); } + + friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); } + + static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; } + static HOST_INLINE Dummy_Scalar rand_host() + { + return {(unsigned)rand() % 10}; + // return {(unsigned)rand()}; + } +}; + +class Dummy_Projective +{ +public: + Dummy_Scalar x; + + static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; } + + static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; } + + // static HOST_DEVICE_INLINE affine_t to_affine(const Dummy_Projective& point) { return {{FF::from(point.x.x)}}; } + + static HOST_DEVICE_INLINE Dummy_Projective from_affine(const affine_t& point) { return {point.x.get_scalar_digit(0,16)}; } + + static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; } + + friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) + { + return {p1.x + p2.x}; + } + + // friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) { + // return p1 + neg(p2); + // } + + friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point) + { + os << point.x; + return os; + } + + friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point) + { + Dummy_Projective res = zero(); + for (int i = 0; i < Dummy_Scalar::NBITS; i++) { + if (i > 0) { res = res + res; } + if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; } + } + return res; + } + + friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) + { + return (p1.x == p2.x); + } + + static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; } + + static HOST_INLINE Dummy_Projective rand_host() + { + return {(unsigned)rand() % 10}; + // return {(unsigned)rand()}; + } +}; + +// typedef scalar_t test_scalar; +// typedef projective_t test_projective; +// typedef affine_t test_affine; + +typedef Dummy_Scalar test_scalar; +typedef Dummy_Projective test_projective; +typedef Dummy_Projective test_affine; + +// TODO ask for help about memory management before / at C.R. +// COMMENT maybe switch to 1d array? uint32_t** msm_bucket_coeffs( const scalar_t* scalars, const unsigned int msm_size, const unsigned int c, - const unsigned int num_windows) + const unsigned int num_windows, + const unsigned int pcf) { /** * Split msm scalars to c-wide coefficients for use in the bucket method @@ -24,25 +134,52 @@ uint32_t** msm_bucket_coeffs( * @param coefficients - output array of the decomposed scalar * @return status of function success / failure in the case of invalid arguments */ - // TODO add check that c divides NBITS - uint32_t** coefficients = new uint32_t*[msm_size]; - for (int i = 0; i < msm_size; i++) + uint32_t** coefficients = new uint32_t*[msm_size*pcf]; + for (int i = 0; i < msm_size*pcf; i++) // TODO split memory initialisation to preprocess { coefficients[i] = new uint32_t[num_windows]; - for (int w = 0; w < num_windows; w++) + std::fill_n(coefficients[i], num_windows, 0); + } + + const int num_full_limbs = scalar_t::NBITS/c; + const int last_limb_bits = scalar_t::NBITS - num_full_limbs * c; + + for (int j = 0; j < msm_size; j++) + { + int count = 0; + bool did_last_limb = false; + for (int i = 0; i < pcf; i++) { - coefficients[i][w] = scalars[i].get_scalar_digit(w, c); + for (int w = 0; w < num_windows; w++) + { + if (count < num_full_limbs) + { + coefficients[msm_size*i + j][w] = scalars[j].get_scalar_digit(num_windows*i + w, c); + } + else + { + // Last window with non-zero data for this coefficient + if (!did_last_limb) coefficients[msm_size*i + j][w] = scalars[j].get_scalar_digit(num_windows*i + w, c) & ((1 << last_limb_bits) - 1); // Remainder is negative + did_last_limb = true; + // Break both loops + i = pcf; + break; + } + count++; + } } } return coefficients; } -projective_t** msm_bucket_accumulator( +template +P** msm_bucket_accumulator( const scalar_t* scalars, const affine_t* bases, const unsigned int c, const unsigned int num_windows, - int msm_size) + const unsigned int msm_size, + const unsigned int pcf) { /** * Accumulate into the different buckets @@ -52,25 +189,37 @@ projective_t** msm_bucket_accumulator( * @param msm_size - number of scalars to add * @param buckets - points array containing all buckets */ - uint32_t** coefficients = msm_bucket_coeffs(scalars, msm_size, c, num_windows); + uint32_t** coefficients = msm_bucket_coeffs(scalars, msm_size, c, num_windows, pcf); uint32_t num_buckets = 1< +P* msm_window_sum( + P** buckets, const unsigned int c, const unsigned int num_windows) { uint32_t num_buckets = 1< 0; i--) { - if (!projective_t::is_zero(buckets[w][i])) partial_sum = partial_sum + buckets[w][i]; - window_sums[w] = window_sums[w] + partial_sum; + if (!P::is_zero(buckets[w][i])) partial_sum = partial_sum + buckets[w][i]; + if (!P::is_zero(partial_sum)) window_sums[w] = window_sums[w] + partial_sum; } } return window_sums; } -projective_t msm_final_sum( - projective_t* window_sums, +template +P msm_final_sum( + P* window_sums, const unsigned int c, const unsigned int num_windows) { - projective_t result = window_sums[num_windows - 1]; + P result = window_sums[num_windows - 1]; for (int w = num_windows - 2; w >= 0; w--) { - for (int dbl = 0; dbl < c; dbl++) - { - result = projective_t::dbl(result); + if (P::is_zero(result)){ + if (!P::is_zero(window_sums[w])) result = P::copy(window_sums[w]); + } + else + { + for (int dbl = 0; dbl < c; dbl++) + { + result = P::dbl(result); + } + if (!P::is_zero(window_sums[w])) result = result + window_sums[w]; } - result = result + window_sums[w]; } return result; } +template void msm_delete_arrays( - projective_t** buckets, - projective_t* windows, + P** buckets, + P* windows, const unsigned int num_windows) { for (int w = 0; w < num_windows; w++) @@ -134,51 +291,92 @@ void msm_delete_arrays( delete[] windows; } -// Double and add +eIcicleError not_supported(const MSMConfig& c) +{ + /** + * Check config for tests that are currently not supported + */ + if (c.batch_size > 1) return eIcicleError::INVALID_ARGUMENT; // TODO add support + if (c.are_scalars_on_device | c.are_points_on_device | c.are_results_on_device) return eIcicleError::INVALID_DEVICE; // COMMENT maybe requires policy change given the possibility of multiple devices on one machine + if (c.are_scalars_montgomery_form | c.are_points_montgomery_form) return eIcicleError::INVALID_ARGUMENT; // TODO add support + if (c.is_async) return eIcicleError::INVALID_DEVICE; //TODO add support + // FIXME fill non-implemented features from MSMConfig + return eIcicleError::SUCCESS; +} + +// Pipenger +template eIcicleError cpu_msm( const Device& device, const scalar_t* scalars, // COMMENT it assumes no negative scalar inputs const affine_t* bases, int msm_size, const MSMConfig& config, - projective_t* results) + P* results) { - const unsigned int c = 15; // TODO integrate into msm config - const int num_windows = (scalar_t::NBITS / c) + ((scalar_t::NBITS % c != 0)? 1 : 0); + // TODO remove at the end + if (not_supported(config) != eIcicleError::SUCCESS) return not_supported(config); + + const unsigned int c = config.ext.get("c"); // TODO integrate into msm config + const unsigned int pcf = config.precompute_factor; + const int num_windows = ((scalar_t::NBITS-1) / (pcf * c)) + 1; + + P** buckets = msm_bucket_accumulator

(scalars, bases, c, num_windows, msm_size, pcf); + P* window_sums = msm_window_sum

(buckets, c, num_windows); + P res = msm_final_sum

(window_sums, c, num_windows); - projective_t** buckets = msm_bucket_accumulator(scalars, bases, c, num_windows, msm_size); - projective_t* window_sums = msm_window_sum(buckets, c, num_windows); - projective_t res = msm_final_sum(window_sums, c, num_windows); - // COMMENT do I need to delete the buckets manually or is it handled automatically when the function finishes? results[0] = res; msm_delete_arrays(buckets, window_sums, num_windows); return eIcicleError::SUCCESS; } +template eIcicleError cpu_msm_ref( const Device& device, const scalar_t* scalars, const affine_t* bases, int msm_size, const MSMConfig& config, - projective_t* results) + P* results) { - projective_t res = projective_t::zero(); + P res = P::zero(); for (auto i = 0; i < msm_size; ++i) { - res = res + projective_t::from_affine(bases[i]) * scalars[i]; + res = res + P::from_affine(bases[i]) * scalars[i]; } return eIcicleError::SUCCESS; } template eIcicleError cpu_msm_precompute_bases( - const Device& device, const A* input_bases, int nof_bases, const MSMConfig& config, A* output_bases) + const Device& device, + const A* input_bases, + int nof_bases, + int precompute_factor, + const MsmPreComputeConfig& config, + A* output_bases) // Pre assigned? { - return eIcicleError::API_NOT_IMPLEMENTED; + const unsigned int c = config.ext.get("c"); + const unsigned int num_windows_no_precomp = (scalar_t::NBITS - 1) / c + 1; + const unsigned int shift = c * ((num_windows_no_precomp - 1) / precompute_factor + 1); + + for (int i = 0; i < nof_bases; i++) + { + projective_t point = projective_t::from_affine(input_bases[i]); + output_bases[i] = input_bases[i]; // COMMENT Should I copy? (not by reference) + for (int j = 1; j < precompute_factor; j++) + { + for (int k = 0; k < shift; k++) + { + point = projective_t::dbl(point); + } + output_bases[nof_bases*j + i] = projective_t::to_affine(point); + } + } + return eIcicleError::SUCCESS; } REGISTER_MSM_PRE_COMPUTE_BASES_BACKEND("CPU", cpu_msm_precompute_bases); -REGISTER_MSM_BACKEND("CPU", (cpu_msm)); +REGISTER_MSM_BACKEND("CPU", cpu_msm); REGISTER_MSM_PRE_COMPUTE_BASES_BACKEND("CPU_REF", cpu_msm_precompute_bases); -REGISTER_MSM_BACKEND("CPU_REF", cpu_msm_ref); +REGISTER_MSM_BACKEND("CPU_REF", cpu_msm_ref); diff --git a/icicle_v3/tests/test_curve_api.cpp b/icicle_v3/tests/test_curve_api.cpp index 258a4d3e..1e7ac705 100644 --- a/icicle_v3/tests/test_curve_api.cpp +++ b/icicle_v3/tests/test_curve_api.cpp @@ -158,23 +158,31 @@ TEST_F(CurveApiTest, ecntt) projective_t::rand_host_many_affine(bases.get(), N); projective_t result_cpu{}; projective_t result_cpu_dbl_n_add{}; - projective_t result_cpu_ref{}; // TODO Yuval should be projective + projective_t result_cpu_ref{}; auto run = [&](const std::string& dev_type, projective_t* out, const char* msg, bool measure, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); - auto init_domain_config = default_ntt_init_domain_config(); - ntt_init_domain(scalar_t::omega(logn), init_domain_config); + const int c = 6; + const int pcf = 10; + auto config = default_msm_config(); + config.ext.set("c", c); + config.precompute_factor = pcf; - auto config = default_ntt_config(); + auto pc_config = default_msm_pre_compute_config(); + pc_config.ext.set("c", c); - START_TIMER(NTT_sync) - for (int i = 0; i < iters; ++i) - ntt(input.get(), N, NTTDir::kForward, config, out); - END_TIMER(NTT_sync, msg, measure); + auto precomp_bases = std::make_unique(N*pcf); - ntt_release_domain(); + START_TIMER(MSM_sync) + for (int i = 0; i < iters; ++i) { + // TODO real test + // msm_precompute_bases(bases.get(), N, 1, default_msm_pre_compute_config(), bases.get()); + msm_precompute_bases(bases.get(), N, pcf, pc_config, precomp_bases.get()); + msm(scalars.get(), precomp_bases.get(), N, config, result); + } + END_TIMER(MSM_sync, msg, measure); }; // run("CPU", &result_cpu_dbl_n_add, "CPU msm", false /*=measure*/, 1 /*=iters*/); // warmup