Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-01-08 23:17:54 -05:00)

Commit: works with precompute factor
@@ -6,14 +6,124 @@
#include "icicle/curves/projective.h"
#include "icicle/curves/curve_config.h"

// #define NDEBUG
#include <cassert> // TODO remove

using namespace curve_config;
using namespace icicle;

// TODO move to test file and add relevant ifdef
class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }

  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};
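Aside: get_scalar_digit above is the primitive the bucket method is built on; it returns the digit_num-th base-2^digit_width digit of the scalar. A minimal standalone sketch of the same slicing (plain unsigned arithmetic, names here are illustrative only, not library code):

#include <cstdint>
#include <cstdio>

// Standalone equivalent of get_scalar_digit for a 32-bit value.
static uint32_t get_digit(uint32_t x, unsigned digit_num, unsigned digit_width)
{
  return (x >> (digit_num * digit_width)) & ((1u << digit_width) - 1);
}

int main()
{
  const uint32_t scalar = 0xABCD1234u;
  const unsigned c = 8; // window width in bits
  for (unsigned w = 0; w < 32 / c; w++) // prints 0x34, 0x12, 0xCD, 0xAB (least-significant digit first)
    std::printf("digit %u = 0x%02X\n", w, get_digit(scalar, w, c));
  return 0;
}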

class Dummy_Projective
{
public:
  Dummy_Scalar x;

  static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; }

  // static HOST_DEVICE_INLINE affine_t to_affine(const Dummy_Projective& point) { return {{FF::from(point.x.x)}}; }

  static HOST_DEVICE_INLINE Dummy_Projective from_affine(const affine_t& point) { return {point.x.get_scalar_digit(0,16)}; }

  static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }

  friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
  {
    return {p1.x + p2.x};
  }

  // friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
  //   return p1 + neg(p2);
  // }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
  {
    os << point.x;
    return os;
  }

  friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
  {
    Dummy_Projective res = zero();
    for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
      if (i > 0) { res = res + res; }
      if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; }
    }
    return res;
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
  {
    return (p1.x == p2.x);
  }

  static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; }

  static HOST_INLINE Dummy_Projective rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};
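Aside: operator* above is a schoolbook MSB-first double-and-add. The same loop over the additive group of uint64_t (so "scalar * point" becomes plain integer multiplication and can be checked directly); a sketch, not library code:

#include <cstdint>
#include <cstdio>

// MSB-first double-and-add over the additive group of uint64_t,
// mirroring the operator* loop above: the result equals scalar * point.
static uint64_t scalar_mul(uint32_t scalar, uint64_t point)
{
  uint64_t res = 0;
  for (int i = 0; i < 32; i++) {
    if (i > 0) res = res + res;                          // "doubling"
    if ((scalar >> (32 - i - 1)) & 1) res = res + point; // conditional add, MSB first
  }
  return res;
}

int main()
{
  std::printf("%llu\n", (unsigned long long)scalar_mul(13, 7)); // prints 91
  return 0;
}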

// typedef scalar_t test_scalar;
// typedef projective_t test_projective;
// typedef affine_t test_affine;

typedef Dummy_Scalar test_scalar;
typedef Dummy_Projective test_projective;
typedef Dummy_Projective test_affine;

// TODO ask for help about memory management before / at C.R.
// COMMENT maybe switch to 1d array?
uint32_t** msm_bucket_coeffs(
  const scalar_t* scalars,
  const unsigned int msm_size,
  const unsigned int c,
  const unsigned int num_windows,
  const unsigned int pcf)
{
  /**
   * Split msm scalars to c-wide coefficients for use in the bucket method
@@ -24,25 +134,52 @@ uint32_t** msm_bucket_coeffs(
   * @param coefficients - output array of the decomposed scalar
   * @return status of function success / failure in the case of invalid arguments
   */
  // TODO add check that c divides NBITS
  uint32_t** coefficients = new uint32_t*[msm_size*pcf];
  for (int i = 0; i < msm_size*pcf; i++) // TODO split memory initialisation to preprocess
  {
    coefficients[i] = new uint32_t[num_windows];
    std::fill_n(coefficients[i], num_windows, 0);
  }

  const int num_full_limbs = scalar_t::NBITS/c;
  const int last_limb_bits = scalar_t::NBITS - num_full_limbs * c;

  for (int j = 0; j < msm_size; j++)
  {
    int count = 0;
    bool did_last_limb = false;
    for (int i = 0; i < pcf; i++)
    {
      for (int w = 0; w < num_windows; w++)
      {
        if (count < num_full_limbs)
        {
          coefficients[msm_size*i + j][w] = scalars[j].get_scalar_digit(num_windows*i + w, c);
        }
        else
        {
          // Last window with non-zero data for this coefficient
          if (!did_last_limb) coefficients[msm_size*i + j][w] = scalars[j].get_scalar_digit(num_windows*i + w, c) & ((1 << last_limb_bits) - 1); // Remainder is negative
          did_last_limb = true;
          // Break both loops
          i = pcf;
          break;
        }
        count++;
      }
    }
  }
  return coefficients;
}
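Aside: with a precompute factor pcf, each scalar is cut into pcf * num_windows digits of c bits, and digit (i, w) is paired with the i-th precomputed copy of the base; num_windows = (NBITS - 1) / (pcf * c) + 1 windows per copy then cover all NBITS bits. A standalone check with toy parameters (plain integers, illustrative only):

#include <cstdint>
#include <cstdio>

int main()
{
  // Toy parameters for illustration, not the library defaults:
  const unsigned NBITS = 32, c = 6, pcf = 2;
  const unsigned num_windows = (NBITS - 1) / (pcf * c) + 1; // ceil(32 / 12) = 3

  const uint32_t scalar = 0x9E3779B9u;

  // Digit (i, w) multiplies base * 2^{c*(num_windows*i + w)}; with precomputation the
  // factor 2^{c*num_windows*i} is baked into the i-th precomputed copy of the base.
  uint64_t recombined = 0;
  for (unsigned i = 0; i < pcf; i++)
    for (unsigned w = 0; w < num_windows; w++) {
      unsigned digit_index = num_windows * i + w;
      uint32_t digit = (uint32_t)((uint64_t)scalar >> (digit_index * c)) & ((1u << c) - 1);
      recombined += (uint64_t)digit << (digit_index * c);
    }

  // Recombining all pcf * num_windows digits reproduces the scalar.
  std::printf("scalar=0x%08X recombined=0x%08llX\n", scalar, (unsigned long long)recombined);
  return 0;
}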

template <typename P>
P** msm_bucket_accumulator(
  const scalar_t* scalars,
  const affine_t* bases,
  const unsigned int c,
  const unsigned int num_windows,
  const unsigned int msm_size,
  const unsigned int pcf)
{
  /**
   * Accumulate into the different buckets
@@ -52,25 +189,37 @@ projective_t** msm_bucket_accumulator(
   * @param msm_size - number of scalars to add
   * @param buckets - points array containing all buckets
   */
  uint32_t** coefficients = msm_bucket_coeffs(scalars, msm_size, c, num_windows, pcf);

  uint32_t num_buckets = 1<<c;
  P** buckets = new P*[num_windows];

  for (int w = 0; w < num_windows; w++)
  {
    buckets[w] = new P[num_buckets];
    std::fill_n(buckets[w], num_buckets, P::zero());
  }

  for (int i = 0; i < pcf; i++)
  {
    for (int j = 0; j < msm_size; j++)
    {
      for (int w = 0; w < num_windows; w++)
      {
        if (coefficients[msm_size*i + j][w] != 0) // TODO 0 will be used for signed version of msm
        {
          if (P::is_zero(buckets[w][coefficients[msm_size*i+j][w]]))
          {
            buckets[w][coefficients[msm_size*i + j][w]] = P::from_affine(bases[msm_size*i + j]);
          }
          else
          {
            buckets[w][coefficients[msm_size*i + j][w]] = buckets[w][coefficients[msm_size*i + j][w]] + bases[msm_size*i + j];
          }
        }
      }
    }
  }

  // TODO memory management
  for (int i = 0; i < msm_size; i++)
  {
    delete[] coefficients[i];
@@ -80,50 +229,58 @@ projective_t** msm_bucket_accumulator(
  return buckets;
}
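Aside: the accumulation step only ever adds each base into one bucket per window; multiplying by the digit value is deferred to the window sum. A toy single-window model with integer "points" (illustrative names and values only):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // Toy single-window example; group addition modeled by integer addition.
  const unsigned c = 4;
  const unsigned num_buckets = 1u << c;
  std::vector<uint32_t> digits = {3, 7, 3, 1};  // the c-bit digit of each scalar in this window
  std::vector<uint64_t> bases  = {10, 20, 30, 40};

  // Accumulation: bucket[d] collects every base whose digit equals d (digit 0 is skipped).
  std::vector<uint64_t> buckets(num_buckets, 0);
  for (size_t i = 0; i < digits.size(); i++)
    if (digits[i] != 0) buckets[digits[i]] += bases[i];

  // The window's contribution is the sum over d of d * bucket[d].
  uint64_t window_sum = 0;
  for (unsigned d = 1; d < num_buckets; d++) window_sum += (uint64_t)d * buckets[d];

  // Equals the direct digit-weighted sum 3*10 + 7*20 + 3*30 + 1*40 = 300.
  std::printf("window_sum = %llu\n", (unsigned long long)window_sum);
  return 0;
}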

template <typename P>
P* msm_window_sum(
  P** buckets,
  const unsigned int c,
  const unsigned int num_windows)
{
  uint32_t num_buckets = 1<<c; // NOTE implicitly assuming that c<32

  P* window_sums = new P[num_windows];

  for (int w = 0; w < num_windows; w++)
  {
    window_sums[w] = P::copy(buckets[w][num_buckets - 1]);
    P partial_sum = P::copy(buckets[w][num_buckets - 1]);

    for (int i = num_buckets-2; i > 0; i--)
    {
      if (!P::is_zero(buckets[w][i])) partial_sum = partial_sum + buckets[w][i];
      if (!P::is_zero(partial_sum)) window_sums[w] = window_sums[w] + partial_sum;
    }
  }
  return window_sums;
}
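Aside: msm_window_sum uses the standard running-sum trick so that sum_d d * B_d needs additions only: sweeping the buckets from the top, the running partial sum is added into the window total at every step, which adds B_d exactly d times overall. A standalone integer check of that identity (illustrative values):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // Check: sum_{d=1}^{n-1} d * B[d] == top-down sweep accumulating a running partial sum.
  std::vector<uint64_t> B = {0, 5, 0, 2, 7, 0, 1, 3}; // buckets of one window, n = 8

  uint64_t direct = 0;
  for (size_t d = 1; d < B.size(); d++) direct += d * B[d];

  uint64_t partial = B.back(), total = B.back();
  for (int d = (int)B.size() - 2; d > 0; d--) {
    partial += B[d];  // partial = B[n-1] + ... + B[d]
    total += partial; // over the whole sweep, B[d] gets added exactly d times
  }

  std::printf("direct=%llu running=%llu\n", (unsigned long long)direct, (unsigned long long)total);
  return 0;
}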

template <typename P>
P msm_final_sum(
  P* window_sums,
  const unsigned int c,
  const unsigned int num_windows)
{
  P result = window_sums[num_windows - 1];
  for (int w = num_windows - 2; w >= 0; w--)
  {
    if (P::is_zero(result)){
      if (!P::is_zero(window_sums[w])) result = P::copy(window_sums[w]);
    }
    else
    {
      for (int dbl = 0; dbl < c; dbl++)
      {
        result = P::dbl(result);
      }
      if (!P::is_zero(window_sums[w])) result = result + window_sums[w];
    }
  }
  return result;
}
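Aside: msm_final_sum combines the window sums as result = sum_w 2^{c*w} * S_w, evaluated Horner-style: start from the top window, double c times, then add the next window down (the is_zero branch above just skips pointless doublings of the identity). An integer model of the same recurrence, checked against the direct sum:

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  const unsigned c = 4;
  std::vector<uint64_t> S = {9, 0, 5, 12}; // per-window sums, least-significant window first

  // Horner-style: double c times, then add the next window down.
  uint64_t result = S.back();
  for (int w = (int)S.size() - 2; w >= 0; w--) {
    for (unsigned dbl = 0; dbl < c; dbl++) result += result; // multiply by 2^c
    result += S[w];
  }

  uint64_t direct = 0;
  for (size_t w = 0; w < S.size(); w++) direct += S[w] << (c * w);

  std::printf("horner=%llu direct=%llu\n", (unsigned long long)result, (unsigned long long)direct);
  return 0;
}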

template <typename P>
void msm_delete_arrays(
  P** buckets,
  P* windows,
  const unsigned int num_windows)
{
  for (int w = 0; w < num_windows; w++)
@@ -134,51 +291,92 @@ void msm_delete_arrays(
  delete[] windows;
}

// Double and add
eIcicleError not_supported(const MSMConfig& c)
{
  /**
   * Check config for tests that are currently not supported
   */
  if (c.batch_size > 1) return eIcicleError::INVALID_ARGUMENT; // TODO add support
  if (c.are_scalars_on_device | c.are_points_on_device | c.are_results_on_device) return eIcicleError::INVALID_DEVICE; // COMMENT maybe requires policy change given the possibility of multiple devices on one machine
  if (c.are_scalars_montgomery_form | c.are_points_montgomery_form) return eIcicleError::INVALID_ARGUMENT; // TODO add support
  if (c.is_async) return eIcicleError::INVALID_DEVICE; // TODO add support
  // FIXME fill non-implemented features from MSMConfig
  return eIcicleError::SUCCESS;
}

// Pippenger
template <typename P>
eIcicleError cpu_msm(
  const Device& device,
  const scalar_t* scalars, // COMMENT it assumes no negative scalar inputs
  const affine_t* bases,
  int msm_size,
  const MSMConfig& config,
  P* results)
{
  // TODO remove at the end
  if (not_supported(config) != eIcicleError::SUCCESS) return not_supported(config);

  const unsigned int c = config.ext.get<int>("c"); // TODO integrate into msm config
  const unsigned int pcf = config.precompute_factor;
  const int num_windows = ((scalar_t::NBITS-1) / (pcf * c)) + 1;

  P** buckets = msm_bucket_accumulator<P>(scalars, bases, c, num_windows, msm_size, pcf);
  P* window_sums = msm_window_sum<P>(buckets, c, num_windows);
  P res = msm_final_sum<P>(window_sums, c, num_windows);

  // COMMENT do I need to delete the buckets manually or is it handled automatically when the function finishes?
  results[0] = res;
  msm_delete_arrays(buckets, window_sums, num_windows);
  return eIcicleError::SUCCESS;
}
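Aside: an end-to-end integer model of the cpu_msm pipeline above (decompose, accumulate buckets, window sums, Horner combination), with uint64_t addition standing in for point addition and the "precomputed" copies realized as bit-shifted bases; toy parameters, checked against the naive sum:

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  const unsigned NBITS = 32, c = 5, pcf = 2;
  const unsigned num_windows = (NBITS - 1) / (pcf * c) + 1; // 4 windows per precomputed copy

  std::vector<uint32_t> scalars = {123456789u, 2718281828u, 314159265u};
  std::vector<uint64_t> bases   = {11, 22, 33};
  const size_t n = scalars.size();

  // "Precompute": copy i of base j is bases[j] shifted left by c*num_windows*i bits.
  std::vector<uint64_t> pre(n * pcf);
  for (unsigned i = 0; i < pcf; i++)
    for (size_t j = 0; j < n; j++) pre[n * i + j] = bases[j] << (c * num_windows * i);

  // Bucket accumulation: digit (i, w) of scalar j sends precomputed base (i, j) to bucket[w][digit].
  std::vector<std::vector<uint64_t>> buckets(num_windows, std::vector<uint64_t>(1u << c, 0));
  for (unsigned i = 0; i < pcf; i++)
    for (size_t j = 0; j < n; j++)
      for (unsigned w = 0; w < num_windows; w++) {
        uint32_t digit = (uint32_t)((uint64_t)scalars[j] >> ((num_windows * i + w) * c)) & ((1u << c) - 1);
        if (digit != 0) buckets[w][digit] += pre[n * i + j];
      }

  // Window sums folded Horner-style (double c times, add the next window down).
  uint64_t result = 0;
  for (int w = (int)num_windows - 1; w >= 0; w--) {
    for (unsigned d = 0; d < c; d++) result += result;
    for (uint32_t d = 1; d < (1u << c); d++) result += (uint64_t)d * buckets[w][d];
  }

  uint64_t expected = 0;
  for (size_t j = 0; j < n; j++) expected += (uint64_t)scalars[j] * bases[j];
  std::printf("result=%llu expected=%llu\n", (unsigned long long)result, (unsigned long long)expected);
  return 0;
}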

template <typename P>
eIcicleError cpu_msm_ref(
  const Device& device,
  const scalar_t* scalars,
  const affine_t* bases,
  int msm_size,
  const MSMConfig& config,
  P* results)
{
  P res = P::zero();
  for (auto i = 0; i < msm_size; ++i) {
    res = res + P::from_affine(bases[i]) * scalars[i];
  }
  return eIcicleError::SUCCESS;
}

template <typename A>
eIcicleError cpu_msm_precompute_bases(
  const Device& device,
  const A* input_bases,
  int nof_bases,
  int precompute_factor,
  const MsmPreComputeConfig& config,
  A* output_bases) // Pre assigned?
{
  const unsigned int c = config.ext.get<int>("c");
  const unsigned int num_windows_no_precomp = (scalar_t::NBITS - 1) / c + 1;
  const unsigned int shift = c * ((num_windows_no_precomp - 1) / precompute_factor + 1);

  for (int i = 0; i < nof_bases; i++)
  {
    projective_t point = projective_t::from_affine(input_bases[i]);
    output_bases[i] = input_bases[i]; // COMMENT Should I copy? (not by reference)
    for (int j = 1; j < precompute_factor; j++)
    {
      for (int k = 0; k < shift; k++)
      {
        point = projective_t::dbl(point);
      }
      output_bases[nof_bases*j + i] = projective_t::to_affine(point);
    }
  }
  return eIcicleError::SUCCESS;
}
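Aside: the shift used by the precomputation is c * ceil(num_windows_no_precomp / precompute_factor) bits, so copy j of a base equals the base multiplied by 2^(shift*j), and each copy serves exactly the num_windows = (NBITS-1)/(pcf*c) + 1 windows that cpu_msm iterates. A worked-arithmetic sketch with illustrative parameters (not taken from the commit):

#include <cstdio>

int main()
{
  // Illustrative parameters, e.g. a 254-bit scalar field:
  const unsigned NBITS = 254, c = 15, precompute_factor = 4;

  const unsigned num_windows_no_precomp = (NBITS - 1) / c + 1;                       // 17
  const unsigned shift = c * ((num_windows_no_precomp - 1) / precompute_factor + 1); // 15 * 5 = 75

  // Copy j of every base is base * 2^(shift * j); each copy then covers
  // shift / c = 5 windows, matching num_windows = (NBITS - 1) / (pcf * c) + 1 in cpu_msm.
  for (unsigned j = 0; j < precompute_factor; j++)
    std::printf("copy %u holds base * 2^%u\n", j, shift * j);
  return 0;
}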

REGISTER_MSM_PRE_COMPUTE_BASES_BACKEND("CPU", cpu_msm_precompute_bases<affine_t>);
REGISTER_MSM_BACKEND("CPU", cpu_msm<projective_t>);

REGISTER_MSM_PRE_COMPUTE_BASES_BACKEND("CPU_REF", cpu_msm_precompute_bases<affine_t>);
REGISTER_MSM_BACKEND("CPU_REF", cpu_msm_ref<projective_t>);

@@ -158,23 +158,31 @@ TEST_F(CurveApiTest, ecntt)
  projective_t::rand_host_many_affine(bases.get(), N);
  projective_t result_cpu{};
  projective_t result_cpu_dbl_n_add{};
  projective_t result_cpu_ref{};

  auto run = [&](const std::string& dev_type, projective_t* out, const char* msg, bool measure, int iters) {
    Device dev = {dev_type, 0};
    icicle_set_device(dev);

    const int c = 6;
    const int pcf = 10;
    auto config = default_msm_config();
    config.ext.set("c", c);
    config.precompute_factor = pcf;

    auto pc_config = default_msm_pre_compute_config();
    pc_config.ext.set("c", c);

    auto precomp_bases = std::make_unique<affine_t[]>(N*pcf);

    START_TIMER(MSM_sync)
    for (int i = 0; i < iters; ++i) {
      // TODO real test
      msm_precompute_bases(bases.get(), N, pcf, pc_config, precomp_bases.get());
      msm(scalars.get(), precomp_bases.get(), N, config, result);
    }
    END_TIMER(MSM_sync, msg, measure);
  };

  // run("CPU", &result_cpu_dbl_n_add, "CPU msm", false /*=measure*/, 1 /*=iters*/); // warmup