Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-01-14 09:58:02 -05:00)

Compare commits: msm/precom ... backend_mo (8 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 42cffb1c88 | |
| | d3274a9eaa | |
| | d31a7019fe | |
| | 84a0d3c348 | |
| | eb87970325 | |
| | a9081aabbf | |
| | b564c6670d | |
| | 1f9f3f13ea | |
@@ -119,6 +119,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
 - [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
 - [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
 - [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
+- [PatStiles](https://github.com/PatStiles), for adding Stark252 field

 ## Help & Support
@@ -88,7 +88,7 @@ void point_near_x(T x, affine_t *point) {
 }

 static int seed = 0;
-static HOST_INLINE T rand_host_seed()
+static T rand_host_seed()
 {
   std::mt19937_64 generator(seed++);
   std::uniform_int_distribution<unsigned> distribution;
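The hunk above only drops the `HOST_INLINE` qualifier, but the helper itself deserves a gloss: seeding a fresh engine from an incrementing counter makes every "random" test input reproducible across runs. A minimal self-contained sketch, specialized to `uint32_t` (the real helper is templated on `T` and its use of `distribution` may differ):

```cpp
// Hedged sketch of a deterministic "random" test helper in the spirit of
// rand_host_seed() above. The incrementing `seed` gives a fresh but
// reproducible engine on each call, so a failing test can be re-run exactly.
#include <cstdint>
#include <random>

static int seed = 0;

static uint32_t rand_host_seed()
{
  std::mt19937_64 generator(seed++); // deterministic per-call engine
  std::uniform_int_distribution<unsigned> distribution;
  return distribution(generator);
}
```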
@@ -1,7 +1,7 @@
 #pragma once

-#include "gpu-utils/sharedmem.cuh"
-#include "gpu-utils/modifiers.cuh"
+#include "../gpu-utils/sharedmem.cuh"
+#include "../gpu-utils/modifiers.cuh"
 #include <iostream>

 template <class FF>
@@ -11,26 +11,26 @@ public:
   FF x;
   FF y;

-  static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
+  static Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }

-  static HOST_DEVICE_INLINE Affine zero() { return {FF::zero(), FF::zero()}; }
+  static Affine zero() { return {FF::zero(), FF::zero()}; }

-  static HOST_DEVICE_INLINE Affine to_montgomery(const Affine& point)
+  static Affine to_montgomery(const Affine& point)
   {
     return {FF::to_montgomery(point.x), FF::to_montgomery(point.y)};
   }

-  static HOST_DEVICE_INLINE Affine from_montgomery(const Affine& point)
+  static Affine from_montgomery(const Affine& point)
   {
     return {FF::from_montgomery(point.x), FF::from_montgomery(point.y)};
   }

-  friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
+  friend bool operator==(const Affine& xs, const Affine& ys)
   {
     return (xs.x == ys.x) && (xs.y == ys.y);
   }

-  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
+  friend std::ostream& operator<<(std::ostream& os, const Affine& point)
   {
     os << "x: " << point.x << "; y: " << point.y;
     return os;
@@ -39,9 +39,9 @@ public:

 template <class FF>
 struct SharedMemory<Affine<FF>> {
-  __device__ Affine<FF>* getPointer()
+  Affine<FF>* getPointer()
   {
-    extern __shared__ Affine<FF> s_affine_[];
+    Affine<FF> *s_affine_ = nullptr;
     return s_affine_;
   }
 };
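Every hunk in this compare follows the same two patterns: include paths become relative (`"gpu-utils/..."` → `"../gpu-utils/..."`), and ICICLE's CUDA qualifier macros (`HOST_DEVICE_INLINE`, `HOST_INLINE`, `DEVICE_INLINE`) are stripped so the headers no longer require nvcc. The macro definitions themselves are not part of this diff; the hedged sketch below shows only a typical shape for such macros, to make clear what is being removed:

```cpp
// Hedged sketch (modifiers.cuh itself is not shown in this diff): qualifier
// macros like the ones being deleted are conventionally defined so the same
// header compiles both under nvcc and under a plain host compiler.
#if defined(__CUDACC__)
  #define HOST_INLINE        __host__ __forceinline__
  #define DEVICE_INLINE      __device__ __forceinline__
  #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
#else
  #define HOST_INLINE        inline
  #define DEVICE_INLINE      inline
  #define HOST_DEVICE_INLINE inline
#endif
```

With a fallback branch like the `#else` above, the macros could have been kept under a host compiler; this branch instead deletes them at each use site, which reads cleaner but forfeits the ability to compile the same headers for device code.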
@@ -1,9 +1,9 @@
 #pragma once
+#ifndef CURVE_CONFIG_H
+#define CURVE_CONFIG_H

-#include "fields/id.h"
-#include "curves/projective.cuh"
+#include "../fields/id.h"
+#include "projective.cuh"

 /**
  * @namespace curve_config
@@ -12,23 +12,23 @@
  * with the `-DCURVE` env variable passed during build.
  */
 #if CURVE_ID == BN254
-#include "curves/params/bn254.cuh"
+#include "params/bn254.cuh"
 namespace curve_config = bn254;

 #elif CURVE_ID == BLS12_381
-#include "curves/params/bls12_381.cuh"
+#include "params/bls12_381.cuh"
 namespace curve_config = bls12_381;

 #elif CURVE_ID == BLS12_377
-#include "curves/params/bls12_377.cuh"
+#include "params/bls12_377.cuh"
 namespace curve_config = bls12_377;

 #elif CURVE_ID == BW6_761
-#include "curves/params/bw6_761.cuh"
+#include "params/bw6_761.cuh"
 namespace curve_config = bw6_761;

 #elif CURVE_ID == GRUMPKIN
-#include "curves/params/grumpkin.cuh"
+#include "params/grumpkin.cuh"
 namespace curve_config = grumpkin;
 #endif
+#endif
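How the `#if CURVE_ID == BN254` chain works: `CURVE_ID` and the curve names must all be preprocessor symbols with distinct integer values, with the names supplied by `fields/id.h` and the selector by the build. The actual values are not shown in this diff; the hedged, runnable sketch below only illustrates the mechanism:

```cpp
// Hedged sketch of the id mechanism used by curve_config. The real values
// live in fields/id.h and are not part of this diff.
#include <cstdio>

#define BN254 1
#define BLS12_381 2

#ifndef CURVE_ID
#define CURVE_ID BN254 // normally supplied by the build, e.g. -DCURVE_ID=BN254
#endif

namespace bn254 { constexpr const char* name = "bn254"; }
namespace bls12_381 { constexpr const char* name = "bls12_381"; }

// After macro expansion the #if below is a plain integer comparison, and
// curve_config becomes an alias for the chosen curve's namespace, so generic
// code can write curve_config::... without knowing which curve was picked.
#if CURVE_ID == BN254
namespace curve_config = bn254;
#elif CURVE_ID == BLS12_381
namespace curve_config = bls12_381;
#endif

int main() { std::printf("selected curve: %s\n", curve_config::name); }
```

The `field_config` header later in this compare uses the identical trick with `FIELD_ID`.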
@@ -2,13 +2,13 @@
 #ifndef BN254_PARAMS_H
 #define BN254_PARAMS_H

-#include "fields/storage.cuh"
+#include "../../fields/storage.cuh"

-#include "curves/macro.h"
-#include "curves/projective.cuh"
-#include "fields/snark_fields/bn254_base.cuh"
-#include "fields/snark_fields/bn254_scalar.cuh"
-#include "fields/quadratic_extension.cuh"
+#include "../macro.h"
+#include "../projective.cuh"
+#include "../../fields/snark_fields/bn254_base.cuh"
+#include "../../fields/snark_fields/bn254_scalar.cuh"
+#include "../../fields/quadratic_extension.cuh"

 namespace bn254 {
   // G1 and G2 generators
@@ -1,7 +1,7 @@
 #pragma once

 #include "affine.cuh"
-#include "gpu-utils/sharedmem.cuh"
+#include "../gpu-utils/sharedmem.cuh"

 template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
 class Projective
@@ -19,34 +19,34 @@ public:
   FF y;
   FF z;

-  static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
+  static Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }

-  static HOST_DEVICE_INLINE Affine<FF> to_affine(const Projective& point)
+  static Affine<FF> to_affine(const Projective& point)
   {
     FF denom = FF::inverse(point.z);
     return {point.x * denom, point.y * denom};
   }

-  static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point)
+  static Projective from_affine(const Affine<FF>& point)
   {
     return point == Affine<FF>::zero() ? zero() : Projective{point.x, point.y, FF::one()};
   }

-  static HOST_DEVICE_INLINE Projective to_montgomery(const Projective& point)
+  static Projective to_montgomery(const Projective& point)
   {
     return {FF::to_montgomery(point.x), FF::to_montgomery(point.y), FF::to_montgomery(point.z)};
   }

-  static HOST_DEVICE_INLINE Projective from_montgomery(const Projective& point)
+  static Projective from_montgomery(const Projective& point)
   {
     return {FF::from_montgomery(point.x), FF::from_montgomery(point.y), FF::from_montgomery(point.z)};
   }

-  static HOST_DEVICE_INLINE Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
+  static Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }

-  static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
+  static Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }

-  static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
+  static Projective dbl(const Projective& point)
   {
     const FF X = point.x;
     const FF Y = point.y;
@@ -74,7 +74,7 @@ public:
     return {X3, Y3, Z3};
   }

-  friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
+  friend Projective operator+(Projective p1, const Projective& p2)
   {
     const FF X1 = p1.x; // < 2
     const FF Y1 = p1.y; // < 2
@@ -118,9 +118,9 @@ public:
     return {X3, Y3, Z3};
   }

-  friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
+  friend Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }

-  friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2)
+  friend Projective operator+(Projective p1, const Affine<FF>& p2)
   {
     const FF X1 = p1.x; // < 2
     const FF Y1 = p1.y; // < 2
@@ -163,12 +163,12 @@ public:
     return {X3, Y3, Z3};
   }

-  friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine<FF>& p2)
+  friend Projective operator-(Projective p1, const Affine<FF>& p2)
   {
     return p1 + Affine<FF>::neg(p2);
   }

-  friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point)
+  friend Projective operator*(SCALAR_FF scalar, const Projective& point)
   {
     Projective res = zero();
 #ifdef __CUDA_ARCH__
@@ -181,27 +181,27 @@ public:
     return res;
   }

-  friend HOST_DEVICE_INLINE Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
+  friend Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }

-  friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2)
+  friend bool operator==(const Projective& p1, const Projective& p2)
   {
     return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z);
   }

-  friend HOST_DEVICE_INLINE bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
+  friend bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }

-  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point)
+  friend std::ostream& operator<<(std::ostream& os, const Projective& point)
   {
     os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }";
     return os;
   }

-  static HOST_DEVICE_INLINE bool is_zero(const Projective& point)
+  static bool is_zero(const Projective& point)
   {
     return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero();
   }

-  static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point)
+  static bool is_on_curve(const Projective& point)
   {
     if (is_zero(point)) return true;
     bool eq_holds =
@@ -210,7 +210,7 @@ public:
     return point.z != FF::zero() && eq_holds;
   }

-  static HOST_INLINE Projective rand_host()
+  static Projective rand_host()
   {
     SCALAR_FF rand_scalar = SCALAR_FF::rand_host();
     return rand_scalar * generator();
@@ -231,9 +231,9 @@ public:

 template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
 struct SharedMemory<Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>> {
-  __device__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
+  Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
   {
-    extern __shared__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> s_projective_[];
+    Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> *s_projective_ = nullptr;
     return s_projective_;
   }
 };
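A reading aid for the class above: a projective point (X, Y, Z) represents the affine point (X/Z, Y/Z). That is why `to_affine` multiplies by the inverse of z, and why `operator==` cross-multiplies instead of comparing coordinates directly, avoiding a costly field inversion. A hedged toy example over the integers mod 13 (not ICICLE code) makes this concrete:

```cpp
// Hedged toy example: projective equality by cross-multiplication.
// (X1, Y1, Z1) == (X2, Y2, Z2) iff X1*Z2 == X2*Z1 and Y1*Z2 == Y2*Z1.
#include <cassert>
#include <cstdint>

static uint32_t mul13(uint32_t a, uint32_t b) { return (a * b) % 13; }

int main()
{
  // (2, 3, 1) and (4, 6, 2) represent the same affine point: (4/2, 6/2) = (2, 3).
  uint32_t x1 = 2, y1 = 3, z1 = 1;
  uint32_t x2 = 4, y2 = 6, z2 = 2;
  assert(mul13(x1, z2) == mul13(x2, z1));
  assert(mul13(y1, z2) == mul13(y2, z1));
}
```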
@@ -18,9 +18,9 @@

 #pragma once

-#include "gpu-utils/error_handler.cuh"
-#include "gpu-utils/modifiers.cuh"
-#include "gpu-utils/sharedmem.cuh"
+#include "../gpu-utils/error_handler.cuh"
+#include "../gpu-utils/modifiers.cuh"
+#include "../gpu-utils/sharedmem.cuh"
 #include "host_math.cuh"
 #include "ptx.cuh"
 #include "storage.cuh"
@@ -38,11 +38,11 @@ public:
   static constexpr unsigned TLC = CONFIG::limbs_count;
   static constexpr unsigned NBITS = CONFIG::modulus_bit_count;

-  static constexpr HOST_DEVICE_INLINE Field zero() { return Field{CONFIG::zero}; }
+  static constexpr Field zero() { return Field{CONFIG::zero}; }

-  static constexpr HOST_DEVICE_INLINE Field one() { return Field{CONFIG::one}; }
+  static constexpr Field one() { return Field{CONFIG::one}; }

-  static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
+  static constexpr Field from(uint32_t value)
   {
     storage<TLC> scalar;
     scalar.limbs[0] = value;
@@ -52,7 +52,7 @@ public:
     return Field{scalar};
   }

-  static HOST_INLINE Field omega(uint32_t logn)
+  static Field omega(uint32_t logn)
   {
     if (logn == 0) { return Field{CONFIG::one}; }

@@ -62,7 +62,7 @@ public:
     return Field{omega.storages[logn - 1]};
   }

-  static HOST_INLINE Field omega_inv(uint32_t logn)
+  static Field omega_inv(uint32_t logn)
   {
     if (logn == 0) { return Field{CONFIG::one}; }

@@ -74,7 +74,7 @@ public:
     return Field{omega_inv.storages[logn - 1]};
   }

-  static HOST_DEVICE_INLINE Field inv_log_size(uint32_t logn)
+  static Field inv_log_size(uint32_t logn)
   {
     if (logn == 0) { return Field{CONFIG::one}; }
 #ifndef __CUDA_ARCH__
@@ -91,7 +91,7 @@ public:
     return Field{inv.storages[logn - 1]};
   }

-  static constexpr HOST_INLINE unsigned get_omegas_count()
+  static constexpr unsigned get_omegas_count()
   {
     if constexpr (has_member_omegas_count<CONFIG>()) {
       return CONFIG::omegas_count;
@@ -113,45 +113,45 @@ public:
   /**
   * A new addition to the config file - \f$ 2^{32 \cdot num\_limbs} - p \f$.
   */
-  static constexpr HOST_DEVICE_INLINE ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
+  static constexpr ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }

   /**
   * A new addition to the config file - the number of times to reduce in [reduce](@ref reduce) function.
   */
-  static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
+  static constexpr unsigned num_of_reductions() { return CONFIG::num_of_reductions; }

   static constexpr unsigned slack_bits = 32 * TLC - NBITS;

   struct Wide {
     ff_wide_storage limbs_storage;

-    static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
+    static constexpr Field get_lower(const Wide& xs)
     {
       Field out{};
 #ifdef __CUDA_ARCH__
       UNROLL
 #endif
       for (unsigned i = 0; i < TLC; i++)
         out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
       return out;
     }

-    static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
+    static constexpr Field get_higher(const Wide& xs)
     {
       Field out{};
 #ifdef __CUDA_ARCH__
       UNROLL
 #endif
       for (unsigned i = 0; i < TLC; i++)
         out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
       return out;
     }

-    static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
+    static constexpr Field get_higher_with_slack(const Wide& xs)
     {
       Field out{};
 #ifdef __CUDA_ARCH__
       UNROLL
 #endif
       for (unsigned i = 0; i < TLC; i++) {
 #ifdef __CUDA_ARCH__
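A note on `slack_bits = 32 * TLC - NBITS` in the hunk above: it counts the unused high bits of the top limb, and `get_higher_with_slack` exists so those bits are not wasted during reduction. A hedged worked example:

```cpp
// Hedged worked example: a 254-bit modulus (e.g. BN254's fields) stored in
// TLC = 8 limbs of 32 bits leaves two unused high bits:
static_assert(32 * 8 - 254 == 2, "slack_bits for a 254-bit modulus in 8 limbs");
// get_higher_with_slack() compensates for them: it returns the upper TLC
// limbs of the 2*TLC-limb product pre-shifted left by 2 * slack_bits, i.e.
// the product's bits starting at 2 * NBITS, which is what the Barrett-style
// reduce() further down expects.
```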
@@ -166,7 +166,7 @@ public:
     }

     template <unsigned REDUCTION_SIZE = 1>
-    static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide& xs)
+    static constexpr Wide sub_modulus_squared(const Wide& xs)
     {
       if (REDUCTION_SIZE == 0) return xs;
       const ff_wide_storage modulus = get_modulus_squared<REDUCTION_SIZE>();
@@ -175,7 +175,7 @@ public:
     }

     template <unsigned MODULUS_MULTIPLE = 1>
-    static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs)
+    static constexpr Wide neg(const Wide& xs)
     {
       const ff_wide_storage modulus = get_modulus_squared<MODULUS_MULTIPLE>();
       Wide rs = {};
@@ -183,14 +183,14 @@ public:
       return rs;
     }

-    friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
+    friend Wide operator+(Wide xs, const Wide& ys)
     {
       Wide rs = {};
       add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
       return sub_modulus_squared<1>(rs);
     }

-    friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys)
+    friend Wide operator-(Wide xs, const Wide& ys)
     {
       Wide rs = {};
       uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -203,7 +203,7 @@ public:

   // return modulus multiplied by 1, 2 or 4
   template <unsigned MULTIPLIER = 1>
-  static constexpr HOST_DEVICE_INLINE ff_storage get_modulus()
+  static constexpr ff_storage get_modulus()
   {
     switch (MULTIPLIER) {
       case 1:
@@ -218,17 +218,17 @@ public:
   }

   template <unsigned MULTIPLIER = 1>
-  static constexpr HOST_DEVICE_INLINE ff_wide_storage modulus_wide()
+  static constexpr ff_wide_storage modulus_wide()
   {
     return CONFIG::modulus_wide;
   }

   // return m
-  static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; }
+  static constexpr ff_storage get_m() { return CONFIG::m; }

   // return modulus^2, helpful for ab +/- cd
   template <unsigned MULTIPLIER = 1>
-  static constexpr HOST_DEVICE_INLINE ff_wide_storage get_modulus_squared()
+  static constexpr ff_wide_storage get_modulus_squared()
   {
     switch (MULTIPLIER) {
       case 1:
@@ -243,7 +243,7 @@ public:
   }

   template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr DEVICE_INLINE uint32_t
+  static constexpr uint32_t
   add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
   {
     r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
@@ -258,7 +258,7 @@ public:

   // add or subtract limbs
   template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr DEVICE_INLINE uint32_t
+  static constexpr uint32_t
   add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
   {
     const uint32_t* x = xs.limbs;
@@ -268,7 +268,7 @@ public:
   }

   template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr DEVICE_INLINE uint32_t
+  static constexpr uint32_t
   add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
   {
     const uint32_t* x = xs.limbs;
@@ -278,7 +278,7 @@ public:
   }

   template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr HOST_INLINE uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
+  static constexpr uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
   {
     const uint32_t* x = xs.limbs;
     const uint32_t* y = ys.limbs;
@@ -291,7 +291,7 @@ public:
   }

   template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr HOST_INLINE uint32_t
+  static constexpr uint32_t
   add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
   {
     const uint32_t* x = xs.limbs;
@@ -305,7 +305,7 @@ public:
   }

   template <bool CARRY_OUT, typename T>
-  static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs)
+  static constexpr uint32_t add_limbs(const T& xs, const T& ys, T& rs)
   {
 #ifdef __CUDA_ARCH__
     return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
@@ -315,7 +315,7 @@ public:
   }

   template <bool CARRY_OUT, typename T>
-  static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
+  static constexpr uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
   {
 #ifdef __CUDA_ARCH__
     return add_sub_limbs_device<true, CARRY_OUT>(xs, ys, rs);
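The add/sub plumbing above dispatches on `__CUDA_ARCH__`: the device path leans on PTX carry flags, the host path on explicit carry propagation. A hedged minimal sketch of what a host-side multi-limb addition in the style of `add_sub_limbs_host` boils down to (not the ICICLE source):

```cpp
// Hedged sketch: host-side multi-limb addition returning the final carry.
// A 64-bit intermediate holds each limb sum plus incoming carry, so the
// carry-out is simply the high 32 bits.
#include <cstddef>
#include <cstdint>

static uint32_t add_limbs_host(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n)
{
  uint32_t carry = 0;
  for (size_t i = 0; i < n; i++) {
    uint64_t t = (uint64_t)x[i] + y[i] + carry;
    r[i] = (uint32_t)t;
    carry = (uint32_t)(t >> 32);
  }
  return carry; // nonzero iff the n-limb sum overflowed
}
```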
@@ -324,18 +324,18 @@ public:
 #endif
   }

-  static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
     UNROLL
     for (size_t i = 0; i < n; i += 2) {
       acc[i] = ptx::mul_lo(a[i], bi);
       acc[i + 1] = ptx::mul_hi(a[i], bi);
     }
   }

-  static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
+  static void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
   {
     UNROLL
     for (size_t i = start_i; i < n; i += 2) {
       acc[i] = ptx::mul_lo(a[i], bi);
       acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -343,14 +343,14 @@ public:
   }

   template <bool CARRY_IN = false>
-  static DEVICE_INLINE void
+  static void
   cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
   {
     if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
     acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
     acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);

     UNROLL
     for (size_t i = 2; i < n; i += 2) {
       acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
       acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
@@ -358,7 +358,7 @@ public:
   }

   template <bool EVEN_PHASE>
-  static DEVICE_INLINE void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
     if (EVEN_PHASE) {
       acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -367,14 +367,14 @@ public:
       acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
     }

     UNROLL
     for (size_t i = 2; i < n; i += 2) {
       acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
       acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
     }
   }

-  static DEVICE_INLINE void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
     if (n > 1)
       acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -382,7 +382,7 @@ public:
       acc[0] = ptx::mad_lo(a[0], bi, acc[0]);

     size_t i;
     UNROLL
     for (i = 1; i < n - 1; i += 2) {
       acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
       if (i == n - 2)
@@ -394,7 +394,7 @@ public:
   }

   template <bool CARRY_OUT = false, bool CARRY_IN = false>
-  static DEVICE_INLINE uint32_t mad_row(
+  static uint32_t mad_row(
     uint32_t* odd,
     uint32_t* even,
     const uint32_t* a,
@@ -419,7 +419,7 @@ public:
   }

   template <bool EVEN_PHASE>
-  static DEVICE_INLINE void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
     cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
     odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
@@ -428,7 +428,7 @@ public:
     odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
   }

-  static DEVICE_INLINE void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
   {
     // bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
     // modulus)
@@ -439,12 +439,12 @@ public:
     return;
   }

-  static DEVICE_INLINE uint32_t
+  static uint32_t
   mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
   {
     acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);

     UNROLL
     for (size_t i = 1; i < n - 1; i += 2) {
       acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
       acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
@@ -467,19 +467,19 @@ public:
   * \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots +
   * (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
   */
-  static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
     if constexpr (TLC > 1) {
       const uint32_t* a = as.limbs;
       const uint32_t* b = bs.limbs;
       uint32_t* even = rs.limbs;
-      __align__(16) uint32_t odd[2 * TLC - 2];
+      uint32_t odd[2 * TLC - 2];

       even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
       odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
       odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
       size_t i;
       UNROLL
       for (i = 2; i < TLC - 1; i += 2) {
         mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
         mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
@@ -504,7 +504,7 @@ public:
   * is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
   * limb products are included.
   */
-  static DEVICE_INLINE void
+  static void
   multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
   {
     ff_storage bs = get_neg_modulus();
@@ -514,7 +514,7 @@ public:
     uint32_t* even = rs.limbs;

     if constexpr (TLC > 2) {
-      __align__(16) uint32_t odd[TLC - 1];
+      uint32_t odd[TLC - 1];
       size_t i;
       // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
       // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
@@ -528,7 +528,6 @@ public:
       mul_n(odd, a + 1, b[0], TLC - 1);
     }
     mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
-    UNROLL
     for (i = 2; i < TLC - 1; i += 2) {
       mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
       mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
@@ -558,15 +557,15 @@ public:
   * that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the
   * result inside TLC limbs and ignore the carries from the highest limb.
   */
-  static DEVICE_INLINE void
+  static void
   multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
   {
-    __align__(16) uint32_t odd[TLC - 2];
+    uint32_t odd[TLC - 2];
     uint32_t first_row_carry = mul_n_and_add(even, a, b[0], in1);
     uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);

     size_t i;
     UNROLL
     for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
       carry = mad_row<true, false>(
         &even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
@@ -587,15 +586,15 @@ public:
   * This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
   * long. The usual schoolbook algorithm is used.
   */
-  static DEVICE_INLINE void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
+  static void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
   {
-    __align__(16) uint32_t odd[TLC - 2];
+    uint32_t odd[TLC - 2];
     mul_n(even, a, b[0], TLC >> 1);
     mul_n(odd, a + 1, b[0], TLC >> 1);
     mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);

     size_t i;
     UNROLL
     for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
       mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
       mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
@@ -614,7 +613,7 @@ public:
   * with so far. This method implements [subtractive
   * Karatsuba](https://en.wikipedia.org/wiki/Karatsuba_algorithm#Implementation).
   */
-  static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
@@ -624,8 +623,8 @@ public:
     // write the results into `r`.
     multiply_short_raw_device(a, b, r);
     multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
-    __align__(16) uint32_t middle_part[TLC];
-    __align__(16) uint32_t diffs[TLC];
+    uint32_t middle_part[TLC];
+    uint32_t diffs[TLC];
     // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to
     // `carry1` and `carry2`.
     uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
@@ -644,7 +643,7 @@ public:
     for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
       r[i] = ptx::addc_cc(r[i], 0);
   } else if (TLC == 2) {
-    __align__(8) uint32_t odd[2];
+    uint32_t odd[2];
     r[0] = ptx::mul_lo(a[0], b[0]);
     r[1] = ptx::mul_hi(a[0], b[0]);
     r[2] = ptx::mul_lo(a[1], b[1]);
@@ -662,7 +661,7 @@ public:
     }
   }

-  static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
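The subtractive Karatsuba named in the doc comment above trades four half-width products for three, tracking the signs of the half differences explicitly. A hedged, self-contained illustration on 32-bit halves of 64-bit operands (not the ICICLE kernel; uses the GCC/Clang `unsigned __int128` extension):

```cpp
// x*y = H*B^2 + (H + L - (x1-x0)*(y1-y0))*B + L  with  B = 2^32,
// H = x1*y1, L = x0*y0. The "subtractive" part: |x1-x0|*|y1-y0| is computed
// with unsigned arithmetic and the sign is applied afterwards.
#include <cassert>
#include <cstdint>

static unsigned __int128 karatsuba64(uint64_t x, uint64_t y)
{
  uint64_t x0 = (uint32_t)x, x1 = x >> 32;
  uint64_t y0 = (uint32_t)y, y1 = y >> 32;
  uint64_t H = x1 * y1, L = x0 * y0;       // two half-size products
  bool sx = x1 < x0, sy = y1 < y0;         // signs of the differences
  uint64_t dx = sx ? x0 - x1 : x1 - x0;
  uint64_t dy = sy ? y0 - y1 : y1 - y0;
  uint64_t D = dx * dy;                    // the third half-size product
  unsigned __int128 mid = (unsigned __int128)H + L;
  mid = (sx == sy) ? mid - D : mid + D;    // middle = x1*y0 + x0*y1
  return ((unsigned __int128)H << 64) + (mid << 32) + L;
}

int main()
{
  uint64_t x = 0x123456789abcdef0ull, y = 0xfedcba9876543210ull;
  assert(karatsuba64(x, y) == (unsigned __int128)x * y);
}
```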
@@ -675,7 +674,7 @@ public:
     }
   }

-  static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
 #ifdef __CUDA_ARCH__
     return multiply_raw_device(as, bs, rs);
@@ -684,7 +683,7 @@ public:
 #endif
   }

-  static HOST_DEVICE_INLINE void
+  static void
   multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
   {
 #ifdef __CUDA_ARCH__
@@ -697,7 +696,7 @@ public:
 #endif
   }

-  static HOST_DEVICE_INLINE void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
   {
 #ifdef __CUDA_ARCH__
     return multiply_msb_raw_device(as, bs, rs);
@@ -709,9 +708,9 @@ public:
 public:
   ff_storage limbs_storage;

-  HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
+  uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }

-  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
+  unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
   {
     const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
     const uint32_t shift_bits = (digit_num * digit_width) % 32;
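`get_scalar_digit` above reads fixed-width windows out of a little-endian limb array, the primitive behind bucket indexing in windowed MSM. A hedged standalone sketch of the same windowing idea (digit widths below 32 assumed; the hypothetical `scalar_digit` helper is ours, not ICICLE's):

```cpp
// Read the digit_num-th digit_width-bit window of a little-endian uint32_t
// array, stitching two limbs together when the window straddles a boundary.
#include <cassert>
#include <cstdint>

static unsigned scalar_digit(const uint32_t* limbs, unsigned num_limbs,
                             unsigned digit_num, unsigned digit_width)
{
  const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
  const uint32_t shift_bits = (digit_num * digit_width) % 32;
  uint64_t window = limbs[limb_lsb_idx];
  if (limb_lsb_idx + 1 < num_limbs) window |= (uint64_t)limbs[limb_lsb_idx + 1] << 32;
  return (unsigned)((window >> shift_bits) & ((1u << digit_width) - 1));
}

int main()
{
  const uint32_t limbs[2] = {0xdeadbeefu, 0x01234567u};
  assert(scalar_digit(limbs, 2, 0, 4) == 0xf); // lowest nibble of limb 0
  assert(scalar_digit(limbs, 2, 7, 4) == 0xd); // highest nibble of limb 0
  assert(scalar_digit(limbs, 2, 8, 4) == 0x7); // first nibble of limb 1
}
```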
@@ -723,7 +722,7 @@ public:
     return rv;
   }

-  static HOST_INLINE Field rand_host()
+  static Field rand_host()
   {
     std::random_device rd;
     std::mt19937_64 generator(rd());
@@ -743,7 +742,7 @@ public:
   }

   template <unsigned REDUCTION_SIZE = 1>
-  static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field& xs)
+  static constexpr Field sub_modulus(const Field& xs)
   {
     if (REDUCTION_SIZE == 0) return xs;
     const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
@@ -764,14 +763,14 @@ public:
     return os;
   }

-  friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
+  friend Field operator+(Field xs, const Field& ys)
   {
     Field rs = {};
     add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
     return sub_modulus<1>(rs);
   }

-  friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
+  friend Field operator-(Field xs, const Field& ys)
   {
     Field rs = {};
     uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -782,7 +781,7 @@ public:
   }

   template <unsigned MODULUS_MULTIPLE = 1>
-  static constexpr HOST_DEVICE_INLINE Wide mul_wide(const Field& xs, const Field& ys)
+  static constexpr Wide mul_wide(const Field& xs, const Field& ys)
   {
     Wide rs = {};
     multiply_raw(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -811,7 +810,7 @@ public:
   * will cause only 1 reduction to be performed.
   */
   template <unsigned MODULUS_MULTIPLE = 1>
-  static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
+  static constexpr Field reduce(const Wide& xs)
   {
     // `xs` is left-shifted by `2 * slack_bits` and higher half is written to `xs_hi`
     Field xs_hi = Wide::get_higher_with_slack(xs);
@@ -836,19 +835,19 @@ public:
     return r;
   }

-  friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
+  friend Field operator*(const Field& xs, const Field& ys)
   {
     Wide xy = mul_wide(xs, ys); // full mult
     return reduce(xy);          // reduce mod p
   }

-  friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
+  friend bool operator==(const Field& xs, const Field& ys)
   {
 #ifdef __CUDA_ARCH__
     const uint32_t* x = xs.limbs_storage.limbs;
     const uint32_t* y = ys.limbs_storage.limbs;
     uint32_t limbs_or = x[0] ^ y[0];
     UNROLL
     for (unsigned i = 1; i < TLC; i++)
       limbs_or |= x[i] ^ y[i];
     return limbs_or == 0;
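`reduce` above follows the Barrett pattern: multiply the high part of the wide product by a precomputed constant `m` (see `get_m()` earlier) to estimate the quotient, subtract the estimated multiple of p, then finish with the bounded number of corrective subtractions that `num_of_reductions()` promises. A hedged single-limb analogue of that flow (the real code operates on TLC-limb storages):

```cpp
// m = floor(2^64 / p), precomputed once per modulus -- the role CONFIG::m
// plays above. Uses the GCC/Clang unsigned __int128 extension.
#include <cassert>
#include <cstdint>

static uint32_t barrett_reduce(uint64_t x, uint32_t p, uint64_t m)
{
  uint64_t q = (uint64_t)(((unsigned __int128)x * m) >> 64); // q <= x / p
  uint64_t r = x - q * p;            // off by at most a couple multiples of p
  while (r >= p) r -= p;             // bounded correction
  return (uint32_t)r;
}

int main()
{
  const uint32_t p = 4294967291u; // 2^32 - 5, a prime
  const uint64_t m = (uint64_t)(((unsigned __int128)1 << 64) / p);
  assert(barrett_reduce(0x123456789abcdef0ull, p, m) == 0x123456789abcdef0ull % p);
}
```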
@@ -859,15 +858,15 @@ public:
 #endif
   }

-  friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
+  friend bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }

   template <const Field& multiplier>
-  static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
+  static Field mul_const(const Field& xs)
   {
     Field mul = multiplier;
     static bool is_u32 = true;
 #ifdef __CUDA_ARCH__
     UNROLL
 #endif
     for (unsigned i = 1; i < TLC; i++)
       is_u32 &= (mul.limbs_storage.limbs[i] == 0);
@@ -877,13 +876,13 @@ public:
   }

   template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
-  static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
+  static constexpr T mul_unsigned(const T& xs)
   {
     T rs = {};
     T temp = xs;
     bool is_zero = true;
 #ifdef __CUDA_ARCH__
     UNROLL
 #endif
     for (unsigned i = 0; i < 32; i++) {
       if (multiplier & (1 << i)) {
@@ -897,28 +896,28 @@ public:
   }

   template <unsigned MODULUS_MULTIPLE = 1>
-  static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const Field& xs)
+  static constexpr Wide sqr_wide(const Field& xs)
   {
     // TODO: change to a more efficient squaring
     return mul_wide<MODULUS_MULTIPLE>(xs, xs);
   }

   template <unsigned MODULUS_MULTIPLE = 1>
-  static constexpr HOST_DEVICE_INLINE Field sqr(const Field& xs)
+  static constexpr Field sqr(const Field& xs)
   {
     // TODO: change to a more efficient squaring
     return xs * xs;
   }

-  static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
+  static constexpr Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }

-  static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs)
+  static constexpr Field from_montgomery(const Field& xs)
   {
     return xs * Field{CONFIG::montgomery_r_inv};
   }

   template <unsigned MODULUS_MULTIPLE = 1>
-  static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
+  static constexpr Field neg(const Field& xs)
   {
     const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
     Field rs = {};
@@ -928,14 +927,14 @@ public:

   // Assumes the number is even!
   template <unsigned MODULUS_MULTIPLE = 1>
-  static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
+  static constexpr Field div2(const Field& xs)
   {
     const uint32_t* x = xs.limbs_storage.limbs;
     Field rs = {};
     uint32_t* r = rs.limbs_storage.limbs;
     if constexpr (TLC > 1) {
 #ifdef __CUDA_ARCH__
       UNROLL
 #endif
       for (unsigned i = 0; i < TLC - 1; i++) {
 #ifdef __CUDA_ARCH__
@@ -949,18 +948,18 @@ public:
     return sub_modulus<MODULUS_MULTIPLE>(rs);
   }

-  static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
+  static constexpr bool lt(const Field& xs, const Field& ys)
   {
     ff_storage dummy = {};
     uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, dummy);
     return carry;
   }

-  static constexpr HOST_DEVICE_INLINE bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
+  static constexpr bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }

-  static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
+  static constexpr bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }

-  static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
+  static constexpr Field inverse(const Field& xs)
   {
     if (xs == zero()) return zero();
     constexpr Field one = Field{CONFIG::one};
@@ -1007,9 +1006,9 @@ struct std::hash<Field<CONFIG>> {

 template <class CONFIG>
 struct SharedMemory<Field<CONFIG>> {
-  __device__ Field<CONFIG>* getPointer()
+  Field<CONFIG>* getPointer()
   {
-    extern __shared__ Field<CONFIG> s_scalar_[];
+    Field<CONFIG> *s_scalar_;
     return s_scalar_;
   }
 };
@@ -2,8 +2,8 @@
 #ifndef FIELD_CONFIG_H
 #define FIELD_CONFIG_H

-#include "fields/id.h"
-#include "fields/field.cuh"
+#include "id.h"
+#include "field.cuh"

 /**
  * @namespace field_config
@@ -11,27 +11,27 @@
  * with the `-DFIELD` env variable passed during build.
  */
 #if FIELD_ID == BN254
-#include "fields/snark_fields/bn254_scalar.cuh"
+#include "snark_fields/bn254_scalar.cuh"
 namespace field_config = bn254;
 #elif FIELD_ID == BLS12_381
-#include "fields/snark_fields/bls12_381_scalar.cuh"
+#include "snark_fields/bls12_381_scalar.cuh"
 using bls12_381::fp_config;
 namespace field_config = bls12_381;
 #elif FIELD_ID == BLS12_377
-#include "fields/snark_fields/bls12_377_scalar.cuh"
+#include "snark_fields/bls12_377_scalar.cuh"
 namespace field_config = bls12_377;
 #elif FIELD_ID == BW6_761
-#include "fields/snark_fields/bw6_761_scalar.cuh"
+#include "snark_fields/bw6_761_scalar.cuh"
 namespace field_config = bw6_761;
 #elif FIELD_ID == GRUMPKIN
-#include "fields/snark_fields/grumpkin_scalar.cuh"
+#include "snark_fields/grumpkin_scalar.cuh"
 namespace field_config = grumpkin;

 #elif FIELD_ID == BABY_BEAR
-#include "fields/stark_fields/babybear.cuh"
+#include "stark_fields/babybear.cuh"
 namespace field_config = babybear;
 #elif FIELD_ID == STARK_252
-#include "fields/stark_fields/stark252.cuh"
+#include "stark_fields/stark252.cuh"
 namespace field_config = stark252;
 #endif
@@ -3,98 +3,97 @@
 #define HOST_MATH_H

 #include <cstdint>
 #include <cuda_runtime.h>
-#include "gpu-utils/modifiers.cuh"
+#include "../gpu-utils/modifiers.cuh"
 namespace host_math {

   // return x + y with uint32_t operands
-  static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
+  static uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }

   // return x + y + carry with uint32_t operands
-  static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
+  static uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }

   // return x + y and carry out with uint32_t operands
-  static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
+  static uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
   {
     uint32_t result;
     result = x + y;
     carry = x > result;
     return result;
   }

   // return x + y + carry and carry out with uint32_t operands
-  static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
+  static uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
   {
     const uint32_t result = x + y + carry;
     carry = carry && x >= result || !carry && x > result;
     return result;
   }

   // return x - y with uint32_t operands
-  static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
+  static uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }

   // return x - y - borrow with uint32_t operands
-  static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
+  static uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }

   // return x - y and borrow out with uint32_t operands
-  static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
+  static uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
   {
     uint32_t result;
     result = x - y;
     borrow = x < result;
     return result;
   }

   // return x - y - borrow and borrow out with uint32_t operands
-  static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
+  static uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
   {
     const uint32_t result = x - y - borrow;
     borrow = borrow && x <= result || !borrow && x < result;
     return result;
   }

   // return x * y + z + carry and carry out with uint32_t operands
-  static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
+  static uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
   {
     uint32_t result;
     uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
     carry = (uint32_t)(r >> 32);
     result = r & 0xffffffff;
     return result;
   }

   template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
   struct carry_chain {
     unsigned index;

-    constexpr HOST_INLINE carry_chain() : index(0) {}
+    constexpr carry_chain() : index(0) {}

-    HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
+    uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
     {
       index++;
       if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
         return host_math::add(x, y);
       else if (index == 1 && !CARRY_IN)
         return host_math::add_cc(x, y, carry);
       else if (index < OPS_COUNT || CARRY_OUT)
         return host_math::addc_cc(x, y, carry);
       else
         return host_math::addc(x, y, carry);
     }

-    HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
+    uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
     {
       index++;
       if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
         return host_math::sub(x, y);
       else if (index == 1 && !CARRY_IN)
         return host_math::sub_cc(x, y, carry);
       else if (index < OPS_COUNT || CARRY_OUT)
         return host_math::subc_cc(x, y, carry);
       else
         return host_math::subc(x, y, carry);
     }
   };
 } // namespace host_math

 #endif
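Two things worth noting about the host_math hunks above. First, `add_cc` detects unsigned overflow without any flags register: for `uint32_t`, x + y wraps modulo 2^32 exactly when the truncated result is smaller than an operand. Second, `carry_chain` strings several such operations together, statically picking `add`, `add_cc`, `addc_cc`, or `addc` from the operation's position in the chain so that carries flow through the whole multi-limb sum. A hedged sanity check of the overflow trick (not ICICLE code):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
  uint32_t r = 0xffffffffu + 2u;    // wraps to 1
  uint32_t carry = 0xffffffffu > r; // the add_cc test: x > result iff overflow
  assert(r == 1u && carry == 1u);

  r = 5u + 7u;                      // no wrap
  carry = 5u > r;
  assert(r == 12u && carry == 0u);
}
```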
@@ -1,139 +1,119 @@
 #pragma once
 #include <cstdint>
 #include <cuda_runtime.h>

 namespace ptx {

-__device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y)
+uint32_t add(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y)
+uint32_t add_cc(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y)
+uint32_t addc(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y)
+uint32_t addc_cc(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
  return result;
 }

-__device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y)
+uint32_t sub(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y)
+uint32_t sub_cc(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y)
+uint32_t subc(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y)
+uint32_t subc_cc(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y)
+uint32_t mul_lo(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y)
+uint32_t mul_hi(const uint32_t x, const uint32_t y)
 {
-  uint32_t result;
-  asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
+uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
 {
-  uint32_t result;
-  asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
+  uint32_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi)
+uint64_t mov_b64(uint32_t lo, uint32_t hi)
 {
-  uint64_t result;
-  asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi));
+  uint64_t result = 0;
   return result;
 }
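Note that the new bodies above stub every PTX wrapper to return 0; they compile on the host but are not functional, presumably placeholders while the host backend routes arithmetic through host_math instead. For reference, a hedged sketch (not what this branch does) of a portable fallback with the same semantics as a few of these wrappers; the PTX carry flag has no host equivalent, so it must become explicit state:

```cpp
#include <cstdint>

static uint32_t g_carry = 0; // explicit, non-thread-safe stand-in for the PTX carry flag

static uint32_t add_cc(uint32_t x, uint32_t y)
{
  uint64_t t = (uint64_t)x + y;
  g_carry = (uint32_t)(t >> 32); // record carry-out, as add.cc.u32 does
  return (uint32_t)t;
}

static uint32_t addc(uint32_t x, uint32_t y)
{
  return x + y + g_carry; // consume carry-in without setting it, as addc.u32 does
}

static uint32_t mul_hi(uint32_t x, uint32_t y)
{
  return (uint32_t)(((uint64_t)x * y) >> 32); // high 32 bits of the full product
}
```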
@@ -141,142 +121,124 @@ namespace ptx {
 // Callers should know exactly what they're calling (no implicit conversions).
 namespace u64 {

-__device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y)
+uint64_t add(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y)
+uint64_t add_cc(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y)
+uint64_t addc(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y)
+uint64_t addc_cc(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y)
+uint64_t sub(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y)
+uint64_t sub_cc(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y)
+uint64_t subc(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y)
+uint64_t subc_cc(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y)
+uint64_t mul_lo(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y)
+uint64_t mul_hi(const uint64_t x, const uint64_t y)
 {
-  uint64_t result;
-  asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
   return result;
 }

-__device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
+uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
 {
-  uint64_t result;
-  asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
+  uint64_t result = 0;
|
||||
return result;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
|
||||
uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
|
||||
{
|
||||
uint64_t result;
|
||||
asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
|
||||
uint64_t result = 0;
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace u64
|
||||
|
||||
__device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count)
|
||||
void bar_arrive(const unsigned name, const unsigned count)
|
||||
{
|
||||
asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory");
|
||||
return;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count)
|
||||
void bar_sync(const unsigned name, const unsigned count)
|
||||
{
|
||||
asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory");
|
||||
return;
|
||||
}
|
||||
|
||||
} // namespace ptx
|
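For orientation, the carry-flag pairs above compose into multi-word arithmetic on the CUDA side. A minimal sketch, assuming the ptx::u64 wrappers keep the signatures shown above (the helper name is hypothetical):

// Sketch: 128-bit addition built from the carry-chain wrappers above.
__device__ void add_u128(const uint64_t x[2], const uint64_t y[2], uint64_t out[2])
{
  out[0] = ptx::u64::add_cc(x[0], y[0]); // add.cc sets the carry flag
  out[1] = ptx::u64::addc(x[1], y[1]);   // addc consumes the carry into the high word
}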
@@ -1,8 +1,8 @@
#pragma once

#include "field.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"

template <typename CONFIG>
class ExtensionField
@@ -16,12 +16,12 @@ private:
FWide real;
FWide imaginary;

friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
}

friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
@@ -34,21 +34,21 @@ public:
FF real;
FF imaginary;

static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }

static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }

static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r}, xs.imaginary * FF{CONFIG::montgomery_r}};
}

static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}};
}

static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }

static void rand_host_many(ExtensionField* out, int size)
{
@@ -57,7 +57,7 @@ public:
}

template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.imaginary)};
}
@@ -68,38 +68,38 @@ public:
return os;
}

friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
}

friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
}

friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.imaginary};
}

friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.imaginary)};
}

friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.imaginary};
}

friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.imaginary};
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
FWide real_prod = FF::mul_wide(xs.real, ys.real);
FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
@@ -110,40 +110,40 @@ public:
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{FF::mul_wide(xs.real, ys), FF::mul_wide(xs.imaginary, ys)};
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return mul_wide(ys, xs);
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
static constexpr ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
}

template <class T1, class T2>
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
friend ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}

friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
}

friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }

template <const ExtensionField& multiplier>
static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
static ExtensionField mul_const(const ExtensionField& xs)
{
static constexpr FF mul_real = multiplier.real;
static constexpr FF mul_imaginary = multiplier.imaginary;
@@ -159,33 +159,33 @@ public:
}

template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
static constexpr ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
static constexpr ExtensionField neg(const ExtensionField& xs)
{
return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
}

// inverse of zero is set to be zero which is what we want most of the time
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
static constexpr ExtensionField inverse(const ExtensionField& xs)
{
ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr(xs.imaginary));
@@ -198,9 +198,9 @@ public:

template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
__device__ ExtensionField<CONFIG>* getPointer()
ExtensionField<CONFIG>* getPointer()
{
extern __shared__ ExtensionField<CONFIG> s_ext2_scalar_[];
ExtensionField<CONFIG> *s_ext2_scalar_;
return s_ext2_scalar_;
}
};
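For reference, the inverse above is the usual conjugate trick for a quadratic extension: writing an element as $x = a + b\,u$ with $u^2 = \beta$ (the configured nonresidue),

$$x^{-1} = \frac{a - b\,u}{a^2 - \beta\,b^2},$$

which is why the code builds `xs_conjugate` and `nonresidue_times_im` and then needs only a single base-field inversion; the final multiply falls in the part of the hunk not shown here.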
@@ -1,8 +1,8 @@
#pragma once

#include "field.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"

template <typename CONFIG>
class ExtensionField
@@ -16,12 +16,12 @@ private:
FWide im2;
FWide im3;

friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}

friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
@@ -36,31 +36,31 @@ public:
FF im2;
FF im3;

static constexpr HOST_DEVICE_INLINE ExtensionField zero()
static constexpr ExtensionField zero()
{
return ExtensionField{FF::zero(), FF::zero(), FF::zero(), FF::zero()};
}

static constexpr HOST_DEVICE_INLINE ExtensionField one()
static constexpr ExtensionField one()
{
return ExtensionField{FF::one(), FF::zero(), FF::zero(), FF::zero()};
}

static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r},
xs.im3 * FF{CONFIG::montgomery_r}};
}

static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv},
xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}};
}

static HOST_INLINE ExtensionField rand_host()
static ExtensionField rand_host()
{
return ExtensionField{FF::rand_host(), FF::rand_host(), FF::rand_host(), FF::rand_host()};
}
@@ -72,7 +72,7 @@ public:
}

template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{
FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.im1),
@@ -86,38 +86,38 @@ public:
return os;
}

friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}

friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}

friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.im1, ys.im2, ys.im3};
}

friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.im1), FF::neg(ys.im2), FF::neg(ys.im3)};
}

friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.im1, xs.im2, xs.im3};
}

friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.im1, xs.im2, xs.im3};
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
if (CONFIG::nonresidue_is_negative)
return ExtensionWide{
@@ -144,21 +144,21 @@ public:
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{
FF::mul_wide(xs.real, ys), FF::mul_wide(xs.im1, ys), FF::mul_wide(xs.im2, ys), FF::mul_wide(xs.im3, ys)};
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return ExtensionWide{
FF::mul_wide(xs, ys.real), FF::mul_wide(xs, ys.im1), FF::mul_wide(xs, ys.im2), FF::mul_wide(xs, ys.im3)};
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
static constexpr ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.im1),
@@ -166,21 +166,21 @@ public:
}

template <class T1, class T2>
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
friend ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}

friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3);
}

friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }

template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {
FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.im1),
@@ -188,27 +188,27 @@ public:
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
static constexpr ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}

template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
static constexpr ExtensionField neg(const ExtensionField& xs)
{
return {FF::neg(xs.real), FF::neg(xs.im1), FF::neg(xs.im2), FF::neg(xs.im3)};
}

// inverse of zero is set to be zero which is what we want most of the time
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
static constexpr ExtensionField inverse(const ExtensionField& xs)
{
FF x, x0, x2;
if (CONFIG::nonresidue_is_negative) {
@@ -249,9 +249,9 @@ public:

template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
__device__ ExtensionField<CONFIG>* getPointer()
ExtensionField<CONFIG>* getPointer()
{
extern __shared__ ExtensionField<CONFIG> s_ext4_scalar_[];
ExtensionField<CONFIG> *s_ext4_scalar_=nullptr;
return s_ext4_scalar_;
}
};
@@ -2,7 +2,7 @@
#ifndef BN254_BASE_PARAMS_H
#define BN254_BASE_PARAMS_H

#include "fields/storage.cuh"
#include "../storage.cuh"

namespace bn254 {
struct fq_config {

@@ -2,9 +2,9 @@
#ifndef BN254_SCALAR_PARAMS_H
#define BN254_SCALAR_PARAMS_H

#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quadratic_extension.cuh"
#include "../storage.cuh"
#include "../field.cuh"
#include "../quadratic_extension.cuh"

namespace bn254 {
struct fp_config {

@@ -1,8 +1,8 @@
#pragma once

#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quartic_extension.cuh"
#include "../storage.cuh"
#include "../field.cuh"
#include "../quartic_extension.cuh"

namespace babybear {
struct fp_config {

@@ -1,20 +1,20 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H

#include <cuda_runtime.h>
#include <cstddef>

namespace device_context {

constexpr std::size_t MAX_DEVICES = 32;
size_t MAX_DEVICES = 32;

/**
* Properties of the device used in icicle functions.
*/
struct DeviceContext {
cudaStream_t& stream; /**< Stream to use. Default value: 0. */
int stream; /**< Stream to use. Default value: 0. */
std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
int mempool; /**< Mempool to use. Default value: 0. */
};

/**
@@ -22,9 +22,9 @@ namespace device_context {
*/
inline DeviceContext get_default_device_context() // TODO: naming convention ?
{
static cudaStream_t default_stream = (cudaStream_t)0;
static int default_stream = 0;
return DeviceContext{
(cudaStream_t&)default_stream, // stream
default_stream, // stream
0, // device_id
0, // mempool
};
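A minimal host-side sketch of how the mocked context above is obtained and threaded through the API (the call site is hypothetical):

// Sketch: fetch the default (CPU-mocked) context and hand it to an icicle entry point.
device_context::DeviceContext ctx = device_context::get_default_device_context();
// ctx.stream, ctx.device_id and ctx.mempool all carry the default value 0 here.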
@@ -3,12 +3,10 @@
#define ERR_H

#include <iostream>

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

enum class IcicleError_t {
enum IcicleError_t {
IcicleSuccess = 0,
InvalidArgument = 1,
MemoryAllocationError = 2,
@@ -38,14 +36,14 @@ private:

public:
// Constructor for cudaError_t with optional message
IcicleError(cudaError_t cudaError, const std::string& msg = "")
: std::runtime_error("CUDA Error: " + std::string(cudaGetErrorString(cudaError)) + " " + msg),
IcicleError(int cudaError, const std::string& msg = "")
: std::runtime_error("Error: " + msg),
errCode(static_cast<int>(cudaError))
{
}

// Constructor for cudaError_t with const char* message
IcicleError(cudaError_t cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
IcicleError(int cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}

// Constructor for IcicleError_t with optional message
IcicleError(IcicleError_t icicleError, const std::string& msg = "")
@@ -67,11 +65,10 @@ public:
#define CHK_LOG(val) check((val), #val, __FILE__, __LINE__)
#define CHK_VAL(val, file, line) check((val), #val, file, line)

cudaError_t inline check(cudaError_t err, const char* const func, const char* const file, const int line)
int inline check(int err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
if (err != 0) {
std::cerr << "CUDA Runtime Error by: " << func << " at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << std::endl << std::endl;
}

return err;
@@ -90,12 +87,12 @@ cudaError_t inline check(cudaError_t err, const char* const func, const char* co
#define THROW_ICICLE_CUDA(val) throwIcicleCudaErr(val, __FUNCTION__, __FILE__, __LINE__)
#define THROW_ICICLE_CUDA_ERR(val, func, file, line) throwIcicleCudaErr(val, func, file, line)
void inline throwIcicleCudaErr(
cudaError_t err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
int err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
{
// TODO: fmt::format introduced only in C++20
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "") + std::string{cudaGetErrorString(err)} +
" : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
"\nThe error is reported there and may be caused by prior calls.\n";
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "");
// + " : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
// "\nThe error is reported there and may be caused by prior calls.\n";
std::cerr << err_msg << std::endl; // TODO: Logging
throw IcicleError{err, err_msg};
}
@@ -111,14 +108,14 @@ void inline throwIcicleErr(
throw IcicleError{err, err_msg};
}

cudaError_t inline checkCudaErrorIsSticky(
cudaError_t err, const char* const func, const char* const file, const int line, bool isThrowing = true)
int inline checkCudaErrorIsSticky(
int err, const char* const func, const char* const file, const int line, bool isThrowing = true)
{
if (err != cudaSuccess) {
if (err != 0) {
// check for sticky (unrecoverable) error when the only option is to restart process
cudaError_t err2 = cudaDeviceSynchronize();
int err2 = 0;
bool is_logged;
if (err2 != cudaSuccess) { // we suspect sticky error
if (err2 != 0) { // we suspect sticky error
if (err != err2) {
is_logged = true;
CHK_ERR(err, func, file, line);
@@ -139,13 +136,13 @@ cudaError_t inline checkCudaErrorIsSticky(
// most common macros to use
#define CHK_INIT_IF_RETURN() \
{ \
cudaError_t err_result = CHK_LAST(); \
int err_result = CHK_LAST(); \
if (err_result != cudaSuccess) return err_result; \
}

#define CHK_IF_RETURN(val) \
{ \
cudaError_t err_result = CHK_STICKY(val); \
int err_result = CHK_STICKY(val); \
if (err_result != cudaSuccess) return err_result; \
}
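A hedged usage sketch for the early-return macros above (do_work and fallible_call are hypothetical; note that the mocked macros still compare against cudaSuccess, so the CPU build is assumed to define it as 0):

// Sketch: typical call pattern for the CHK_* macros above.
cudaError_t do_work()
{
  CHK_INIT_IF_RETURN();           // propagate any error already pending
  CHK_IF_RETURN(fallible_call()); // run one fallible step and bail out on failure
  return 0;                       // 0 plays the role of cudaSuccess in the mock
}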
@@ -6,6 +6,6 @@
#define UNROLL #pragma unroll
#endif

#define HOST_INLINE __host__ INLINE_MACRO
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO
// #define __host__ INLINE_MACRO
// #define INLINE_MACRO
// #define __host__ INLINE_MACRO

@@ -24,7 +24,7 @@
* definitions.
*
* To use dynamically allocated shared memory in a templatized __global__ or
* __device__ function, just replace code like this:
* function, just replace code like this:
*
* <pre>
* template<class T>
@@ -32,7 +32,7 @@
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* extern __shared__ T sdata[];
* T sdata[];
* ...
* doStuff(sdata);
* ...
@@ -62,7 +62,7 @@
*
* This struct uses template specialization on the type \a T to declare
* a differently named dynamic shared memory array for each type
* (\code extern __shared__ T s_type[] \endcode).
* (\code T s_type[] \endcode).
*
* Currently there are specializations for the following types:
* \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
@@ -73,11 +73,10 @@ template <typename T>
struct SharedMemory {
//! @brief Return a pointer to the runtime-sized shared memory array.
//! @returns Pointer to runtime-sized shared memory array
__device__ T* getPointer()
T* getPointer()
{
// extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
// Error_UnsupportedType();
return (T*)0;
T* a = nullptr; // Initialize pointer to nullptr or allocate memory as needed
return a;
}
// TODO: Use operator overloading to make this class look like a regular array
};
@@ -88,129 +87,128 @@ struct SharedMemory {

template <>
struct SharedMemory<int> {
__device__ int* getPointer()
int* getPointer()
{
extern __shared__ int s_int[];
return s_int;
return 0;
}
};

template <>
struct SharedMemory<unsigned int> {
__device__ unsigned int* getPointer()
unsigned int* getPointer()
{
extern __shared__ unsigned int s_uint[];
return s_uint;
return 0;
}
};

template <>
struct SharedMemory<char> {
__device__ char* getPointer()
char* getPointer()
{
extern __shared__ char s_char[];
return s_char;
char *a = nullptr;
return a;
}
};

template <>
struct SharedMemory<unsigned char> {
__device__ unsigned char* getPointer()
unsigned char* getPointer()
{
extern __shared__ unsigned char s_uchar[];
return s_uchar;
unsigned char* a = nullptr;
return a;
}
};

template <>
struct SharedMemory<short> {
__device__ short* getPointer()
short* getPointer()
{
extern __shared__ short s_short[];
return s_short;
short* a = nullptr;
return a;
}
};

template <>
struct SharedMemory<unsigned short> {
__device__ unsigned short* getPointer()
unsigned short* getPointer()
{
extern __shared__ unsigned short s_ushort[];
return s_ushort;
unsigned short* a = nullptr;
return a;
}
};

template <>
struct SharedMemory<long> {
__device__ long* getPointer()
long* getPointer()
{
extern __shared__ long s_long[];
long *s_long = nullptr;
return s_long;
}
};

template <>
struct SharedMemory<unsigned long> {
__device__ unsigned long* getPointer()
unsigned long* getPointer()
{
extern __shared__ unsigned long s_ulong[];
unsigned long *s_ulong = nullptr;
return s_ulong;
}
};

template <>
struct SharedMemory<long long> {
__device__ long long* getPointer()
long long* getPointer()
{
extern __shared__ long long s_longlong[];
long long *s_longlong;
return s_longlong;
}
};

template <>
struct SharedMemory<unsigned long long> {
__device__ unsigned long long* getPointer()
unsigned long long* getPointer()
{
extern __shared__ unsigned long long s_ulonglong[];
unsigned long long *s_ulonglong;
return s_ulonglong;
}
};

template <>
struct SharedMemory<bool> {
__device__ bool* getPointer()
bool* getPointer()
{
extern __shared__ bool s_bool[];
bool *s_bool;
return s_bool;
}
};

template <>
struct SharedMemory<float> {
__device__ float* getPointer()
float* getPointer()
{
extern __shared__ float s_float[];
float *s_float;
return s_float;
}
};

template <>
struct SharedMemory<double> {
__device__ double* getPointer()
double* getPointer()
{
extern __shared__ double s_double[];
double *s_double;
return s_double;
}
};

template <>
struct SharedMemory<uchar4> {
__device__ uchar4* getPointer()
{
extern __shared__ uchar4 s_uchar4[];
return s_uchar4;
}
};

// template <>
// struct SharedMemory<uchar4> {
// uchar4* getPointer()
// {
// uchar4 *s_uchar4;
// return s_uchar4;
// }
// };

#endif //_SHAREDMEM_H_
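For context, a minimal sketch of the CUDA-side pattern these specializations existed for, before the mock removed the extern __shared__ arrays (reverse is a hypothetical kernel):

// Sketch: how SharedMemory<T> binds a kernel to its dynamic shared memory.
template <typename T>
__global__ void reverse(T* d_out, const T* d_in, int n)
{
  SharedMemory<T> smem;
  T* sdata = smem.getPointer(); // resolves to the extern __shared__ array for T
  int tid = threadIdx.x;
  if (tid < n) sdata[tid] = d_in[tid];
  __syncthreads();
  if (tid < n) d_out[tid] = sdata[n - 1 - tid];
}
// Launched with the shared-memory size supplied at runtime:
// reverse<T><<<1, n, n * sizeof(T)>>>(d_out, d_in, n);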
@@ -3,9 +3,9 @@
#define KECCAK_H

#include <cstdint>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"

#include "../../gpu-utils/device_context.cuh"
#include "../../gpu-utils/error_handler.cuh"
typedef int cudaError_t;
namespace keccak {
/**
* @struct KeccakConfig

@@ -2,13 +2,12 @@
#ifndef MSM_H
#define MSM_H

#include <cuda_runtime.h>

#include "curves/affine.cuh"
#include "curves/projective.cuh"
#include "fields/field.cuh"
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "../curves/affine.cuh"
#include "../curves/projective.cuh"
#include "../fields/field.cuh"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"

/**
* @namespace msm
@@ -124,6 +123,8 @@ namespace msm {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/

typedef int cudaError_t;
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
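A hedged call-site sketch for the msm declaration above; config is assumed to be a populated MSMConfig, with the buffers being host or device pointers according to its flags:

// Sketch: invoking the templated msm entry point with the curve_config types.
projective_t result;
cudaError_t err = msm::msm<scalar_t, affine_t, projective_t>(scalars, points, msm_size, config, &result);
if (err != 0) { /* handle the error code */ }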
@@ -2,13 +2,11 @@
|
||||
#ifndef NTT_H
|
||||
#define NTT_H
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
#include "gpu-utils/error_handler.cuh"
|
||||
#include "gpu-utils/sharedmem.cuh"
|
||||
#include "utils/utils_kernels.cuh"
|
||||
#include "utils/utils.h"
|
||||
#include "../gpu-utils/device_context.cuh"
|
||||
#include "../gpu-utils/error_handler.cuh"
|
||||
#include "../gpu-utils/sharedmem.cuh"
|
||||
#include "../utils/utils_kernels.cuh"
|
||||
#include "../utils/utils.h"
|
||||
|
||||
/**
|
||||
* @namespace ntt
|
||||
@@ -36,6 +34,8 @@ namespace ntt {
|
||||
* primitive_root).
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
typedef int cudaError_t;
|
||||
|
||||
template <typename S>
|
||||
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);
|
||||
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
#define _NTT_IMPL_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "ntt/ntt.cuh" // for enum Ordering
|
||||
|
||||
#include "ntt.cuh" // for enum Ordering
|
||||
typedef int cudaError_t;
|
||||
typedef int cudaStream_t;
|
||||
namespace mxntt {
|
||||
|
||||
template <typename S>
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
#include "fields/field_config.cuh"
|
||||
#include "polynomials/polynomials.h"
|
||||
#include "../../gpu-utils/device_context.cuh"
|
||||
#include "../../fields/field_config.cuh"
|
||||
#include "../polynomials.h"
|
||||
|
||||
using device_context::DeviceContext;
|
||||
|
||||
@@ -11,7 +11,7 @@ namespace polynomials {
|
||||
class CUDAPolynomialFactory : public AbstractPolynomialFactory<C, D, I>
|
||||
{
|
||||
std::vector<DeviceContext> m_device_contexts; // device-id --> device context
|
||||
std::vector<cudaStream_t> m_device_streams; // device-id --> device stream. Storing the streams here as workaround
|
||||
std::vector<int> m_device_streams; // device-id --> device stream. Storing the streams here as workaround
|
||||
// since DeviceContext has a reference to a stream.
|
||||
|
||||
public:
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <algorithm> // for std::max
|
||||
#include <cstdint> // for uint64_t, etc.
|
||||
#include <memory>
|
||||
#include "utils/integrity_pointer.h"
|
||||
#include "../utils/integrity_pointer.h"
|
||||
|
||||
namespace polynomials {
|
||||
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include "utils/integrity_pointer.h"
|
||||
#include "fields/field_config.cuh"
|
||||
#include "../utils/integrity_pointer.h"
|
||||
#include "../fields/field_config.cuh"
|
||||
|
||||
#include "polynomial_context.h"
|
||||
#include "polynomial_backend.h"
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
|
||||
#include <cstdint>
|
||||
#include <stdexcept>
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
#include "gpu-utils/error_handler.cuh"
|
||||
#include "utils/utils.h"
|
||||
#include "../gpu-utils/device_context.cuh"
|
||||
#include "../gpu-utils/error_handler.cuh"
|
||||
#include "../utils/utils.h"
|
||||
|
||||
/**
|
||||
* @namespace poseidon
|
||||
@@ -117,6 +117,7 @@ namespace poseidon {
|
||||
/**
|
||||
* Loads pre-calculated optimized constants, moves them to the device
|
||||
*/
|
||||
typedef int cudaError_t;
|
||||
template <typename S>
|
||||
cudaError_t
|
||||
init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
|
||||
|
||||
@@ -9,7 +9,7 @@ namespace mont {
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
|
||||
template <typename E, bool is_into>
|
||||
__global__ void MontgomeryKernel(const E* input, int n, E* output)
|
||||
void MontgomeryKernel(const E* input, int n, E* output)
|
||||
{
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < n) { output[tid] = is_into ? E::to_montgomery(input[tid]) : E::from_montgomery(input[tid]); }
|
||||
|
||||
@@ -4,14 +4,13 @@
|
||||
|
||||
namespace utils_internal {
|
||||
template <typename E, typename S>
|
||||
__global__ void NormalizeKernel(E* arr, S scalar, int n)
|
||||
void NormalizeKernel(E* arr, S scalar, int n)
|
||||
{
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < n) { arr[tid] = scalar * arr[tid]; }
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename E, typename S>
|
||||
__global__ void BatchMulKernel(
|
||||
void BatchMulKernel(
|
||||
const E* in_vec,
|
||||
int n_elements,
|
||||
int batch_size,
|
||||
@@ -22,12 +21,7 @@ namespace utils_internal {
|
||||
bool bitrev,
|
||||
E* out_vec)
|
||||
{
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_elements * batch_size) {
|
||||
int64_t scalar_id = tid % n_elements;
|
||||
if (bitrev) scalar_id = __brev(scalar_id) >> (32 - logn);
|
||||
out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
} // namespace utils_internal
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#ifndef LDE_H
|
||||
#define LDE_H
|
||||
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
#include "../gpu-utils/device_context.cuh"
|
||||
|
||||
/**
|
||||
* @namespace vec_ops
|
||||
@@ -57,6 +57,7 @@ namespace vec_ops {
|
||||
* @tparam E The type of elements `vec_b` and `result`. Often (but not always) `E=S`.
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
typedef int cudaError_t;
|
||||
template <typename E, typename S>
|
||||
cudaError_t Mul(const S* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result);
|
||||
|
||||
|
||||
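Since Mul above is only declared here, its intended element-wise semantics can be stated as a plain loop (a reference sketch under that reading, not the library implementation):

// Sketch: reference semantics of vec_ops::Mul, one product per element.
template <typename E, typename S>
void mul_reference(const S* vec_a, const E* vec_b, int n, E* result)
{
  for (int i = 0; i < n; ++i)
    result[i] = vec_a[i] * vec_b[i]; // the kernel applies the same operation per thread
}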
@@ -1,29 +1,26 @@
|
||||
if (G2)
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
|
||||
endif ()
|
||||
|
||||
set(TARGET icicle_curve)
|
||||
set(FIELD_TARGET icicle_field)
|
||||
|
||||
set(SRC ${CMAKE_SOURCE_DIR}/src)
|
||||
set(SRC ../../)
|
||||
|
||||
set(CURVE_SOURCE ${SRC}/curves/extern.cu)
|
||||
set(CURVE_SOURCE ${SRC}/curves/extern.cpp)
|
||||
if(G2)
|
||||
list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cu)
|
||||
endif()
|
||||
if(MSM)
|
||||
list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cu)
|
||||
if(G2)
|
||||
list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cu)
|
||||
endif()
|
||||
endif()
|
||||
if(ECNTT)
|
||||
list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cu)
|
||||
list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cu)
|
||||
list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cpp)
|
||||
endif()
|
||||
# if(MSM)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
|
||||
# if(G2)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
|
||||
# endif()
|
||||
# endif()
|
||||
# if(ECNTT)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
|
||||
# endif()
|
||||
|
||||
add_library(${TARGET} STATIC ${CURVE_SOURCE})
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
|
||||
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
|
||||
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
|
||||
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_17)
|
||||
40
icicle/src/curves/extern.cpp
Normal file
40
icicle/src/curves/extern.cpp
Normal file
@@ -0,0 +1,40 @@
|
||||
#define CURVE_ID BN254
|
||||
#include "../../include/curves/curve_config.cuh"
|
||||
|
||||
using namespace curve_config;
|
||||
|
||||
#include "../../include/gpu-utils/device_context.cuh"
|
||||
#include "../../include/utils/utils.h"
|
||||
// #include "../utils/mont.cuh"
|
||||
|
||||
extern "C" bool CONCAT_EXPAND(CURVE, eq)(projective_t* point1, projective_t* point2)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" void CONCAT_EXPAND(CURVE, to_affine)(projective_t* point, affine_t* point_out)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" void CONCAT_EXPAND(CURVE, generate_projective_points)(projective_t* points, int size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" void CONCAT_EXPAND(CURVE, generate_affine_points)(affine_t* points, int size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" int CONCAT_EXPAND(CURVE, affine_convert_montgomery)(
|
||||
affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" int CONCAT_EXPAND(CURVE, projective_convert_montgomery)(
|
||||
projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
39
icicle/src/curves/extern_g2.cpp
Normal file
39
icicle/src/curves/extern_g2.cpp
Normal file
@@ -0,0 +1,39 @@
|
||||
#include "curves/curve_config.cuh"
|
||||
|
||||
using namespace curve_config;
|
||||
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
#include "utils/utils.h"
|
||||
#include "utils/mont.cuh"
|
||||
|
||||
extern "C" bool CONCAT_EXPAND(CURVE, g2_eq)(g2_projective_t* point1, g2_projective_t* point2)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" void CONCAT_EXPAND(CURVE, g2_to_affine)(g2_projective_t* point, g2_affine_t* point_out)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_projective_points)(g2_projective_t* points, int size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_affine_points)(g2_affine_t* points, int size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_affine_convert_montgomery)(
|
||||
g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_projective_convert_montgomery)(
|
||||
g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
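The CONCAT_EXPAND names above are resolved by the preprocessor; assuming the usual underscore-pasting definition in utils.h, CURVE=bn254 turns the first symbol into bn254_eq. A hedged caller sketch:

// Sketch: what CONCAT_EXPAND(CURVE, eq) is assumed to expand to for CURVE=bn254.
extern "C" bool bn254_eq(projective_t* point1, projective_t* point2);

bool points_equal(projective_t* a, projective_t* b) { return bn254_eq(a, b); }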
@@ -1,40 +1,37 @@
|
||||
if (EXT_FIELD)
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
|
||||
endif ()
|
||||
|
||||
SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin)
|
||||
|
||||
set(TARGET icicle_field)
|
||||
|
||||
set(SRC ${CMAKE_SOURCE_DIR}/src)
|
||||
set(SRC ../../)
|
||||
|
||||
set(FIELD_SOURCE ${SRC}/fields/extern.cu)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
|
||||
set(FIELD_SOURCE ${SRC}/fields/extern.cpp)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
|
||||
if(EXT_FIELD)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cpp)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
|
||||
endif()
|
||||
|
||||
set(POLYNOMIAL_SOURCE_FILES
|
||||
${SRC}/polynomials/polynomials.cu
|
||||
${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
|
||||
${SRC}/polynomials/polynomials_c_api.cu)
|
||||
# set(POLYNOMIAL_SOURCE_FILES
|
||||
# ${SRC}/polynomials/polynomials.cu
|
||||
# ${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
|
||||
# ${SRC}/polynomials/polynomials_c_api.cu)
|
||||
|
||||
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
|
||||
# list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
|
||||
|
||||
# TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
|
||||
if (DEFINED CURVE)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
|
||||
endif()
|
||||
# if (DEFINED CURVE)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
|
||||
# endif()
|
||||
|
||||
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
|
||||
list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
|
||||
endif()
|
||||
# if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
|
||||
# list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
|
||||
# endif()
|
||||
|
||||
add_library(${TARGET} STATIC ${FIELD_SOURCE})
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
|
||||
target_compile_definitions(${TARGET} PUBLIC FIELD=${FIELD})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_17)
|
||||
19
icicle/src/fields/extern.cpp
Normal file
19
icicle/src/fields/extern.cpp
Normal file
@@ -0,0 +1,19 @@
|
||||
#define FIELD_ID BN254
|
||||
#include "../../include/fields/field_config.cuh"
|
||||
|
||||
using namespace field_config;
|
||||
|
||||
//#include "../../include/utils/mont.cuh"
|
||||
#include "../../include/utils/utils.h"
|
||||
#include "../../include/gpu-utils/device_context.cuh"
|
||||
|
||||
extern "C" void CONCAT_EXPAND(FIELD, generate_scalars)(scalar_t* scalars, int size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" int CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(
|
||||
scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
18
icicle/src/fields/extern_extension.cpp
Normal file
18
icicle/src/fields/extern_extension.cpp
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "fields/field_config.cuh"
|
||||
|
||||
using namespace field_config;
|
||||
|
||||
#include "utils/mont.cuh"
|
||||
#include "utils/utils.h"
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
|
||||
extern "C" void CONCAT_EXPAND(FIELD, extension_generate_scalars)(extension_t* scalars, int size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(
|
||||
extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET icicle_hash)
|
||||
|
||||
add_library(${TARGET} STATIC keccak/keccak.cu)
|
||||
add_library(${TARGET} STATIC keccak/keccak.cpp)
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_17)
|
||||
24
icicle/src/hash/keccak/keccak.cpp
Normal file
24
icicle/src/hash/keccak/keccak.cpp
Normal file
@@ -0,0 +1,24 @@
|
||||
#include "../../../include/hash/keccak/keccak.cuh"
|
||||
|
||||
typedef int cudaError_t;
|
||||
|
||||
namespace keccak {
|
||||
template <int C, int D>
|
||||
cudaError_t
|
||||
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" cudaError_t
|
||||
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" cudaError_t
|
||||
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
} // namespace keccak
|
||||
@@ -144,14 +144,14 @@ namespace keccak {
|
||||
element ^= rc; \
|
||||
}
|
||||
|
||||
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
|
||||
const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
|
||||
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
|
||||
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
|
||||
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
|
||||
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
|
||||
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
|
||||
|
||||
__device__ void keccakf(uint64_t s[25])
|
||||
void keccakf(uint64_t s[25])
|
||||
{
|
||||
uint64_t t0, t1, t2, t3, t4;
|
||||
|
||||
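For orientation, the RC table above feeds the iota step of the standard 24-round Keccak-f[1600]; a sketch of where it is consumed inside keccakf (round structure per the Keccak reference, other steps elided):

// Sketch: the iota step injects one round constant per round into lane (0,0).
for (int round = 0; round < 24; ++round) {
  // theta, rho, pi, chi ... (elided)
  s[0] ^= RC[round];
}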
|
||||
28
icicle/src/msm/CMakeLists.txt
Normal file
28
icicle/src/msm/CMakeLists.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
set(TARGET icicle_msm)
|
||||
set(CURVE_TARGET icicle_curve)
|
||||
set(FIELD_TARGET icicle_field)
|
||||
|
||||
set(SRC ../)
|
||||
|
||||
set(MSM_SOURCE ${SRC}/msm/extern.cpp)
|
||||
if(G2)
|
||||
list(APPEND MSM_SOURCE ${SRC}/msm/extern_g2.cpp)
|
||||
endif()
|
||||
# if(MSM)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
|
||||
# if(G2)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
|
||||
# endif()
|
||||
# endif()
|
||||
# if(ECNTT)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
|
||||
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
|
||||
# endif()
|
||||
|
||||
add_library(${TARGET} STATIC ${MSM_SOURCE})
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
|
||||
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
|
||||
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
|
||||
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_17)
|
||||
46
icicle/src/msm/extern.cpp
Normal file
46
icicle/src/msm/extern.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
#define CURVE_ID BN254
|
||||
#define FIELD_ID BN254
|
||||
#include "../../include/curves/curve_config.cuh"
|
||||
#include "../../include/fields/field_config.cuh"
|
||||
#include "../../include/gpu-utils/device_context.cuh"
|
||||
#include "../../include/msm/msm.cuh"
|
||||
|
||||
typedef int cudaError_t;
|
||||
using namespace curve_config;
|
||||
using namespace field_config;
|
||||
|
||||
#include "../../include/utils/utils.h"
|
||||
|
||||
namespace msm {
|
||||
/**
|
||||
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
|
||||
* template parameters (where the curve is given by `-DCURVE` env variable during build):
|
||||
* - `A` is the [affine representation](@ref affine_t) of curve points;
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_bases_cuda)(
|
||||
affine_t* bases,
|
||||
int bases_size,
|
||||
int precompute_factor,
|
||||
int _c,
|
||||
bool are_bases_on_device,
|
||||
device_context::DeviceContext& ctx,
|
||||
affine_t* output_bases)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
|
||||
* (where the curve is given by `-DCURVE` env variable during build):
|
||||
* - `S` is the [scalar field](@ref scalar_t) of the curve;
|
||||
* - `A` is the [affine representation](@ref affine_t) of curve points;
|
||||
* - `P` is the [projective representation](@ref projective_t) of curve points.
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t CONCAT_EXPAND(CURVE, msm_cuda)(
|
||||
const scalar_t* scalars, const affine_t* points, int msm_size, MSMConfig& config, projective_t* out)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
} // namespace msm
|
||||

icicle/src/msm/extern_g2.cpp (new file, 43 lines)
@@ -0,0 +1,43 @@
#include "curves/curve_config.cuh"
#include "fields/field_config.cuh"

using namespace curve_config;
using namespace field_config;

#include "msm.cu"
#include "utils/utils.h"

namespace msm {
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_bases_cuda)(
g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
g2_affine_t* output_bases)
{
return precompute_msm_bases<g2_affine_t, g2_projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}

/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_msm_cuda)(
const scalar_t* scalars, const g2_affine_t* points, int msm_size, MSMConfig& config, g2_projective_t* out)
{
return msm<scalar_t, g2_affine_t, g2_projective_t>(scalars, points, msm_size, config, out);
}
} // namespace msm

@@ -20,32 +20,32 @@ public:
unsigned p = 10;
// unsigned p = 1<<30;

static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
static Dummy_Scalar zero() { return {0}; }

static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
static Dummy_Scalar one() { return {1}; }

friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
{
os << scalar.x;
return os;
}

HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
{
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
}

friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
{
return {(p1.x + p2.x) % p1.p};
}

friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static HOST_INLINE Dummy_Scalar rand_host()
static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static Dummy_Scalar rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};
@@ -57,32 +57,32 @@ class Dummy_Projective
public:
Dummy_Scalar x;

static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; }
static Dummy_Projective zero() { return {0}; }

static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; }
static Dummy_Projective one() { return {1}; }

static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }

static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }

static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }

friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
{
return {p1.x + p2.x};
}

// friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// return p1 + neg(p2);
// }

friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
{
os << point.x;
return os;
}

friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
{
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
@@ -95,14 +95,14 @@ public:
return res;
}

friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
{
return (p1.x == p2.x);
}

static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
static bool is_zero(const Dummy_Projective& point) { return point.x == 0; }

static HOST_INLINE Dummy_Projective rand_host()
static Dummy_Projective rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};
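
The Dummy types in this test pin down the minimal interface MSM needs from scalars and points; the body of `operator*` elided at the hunk boundary presumably follows the usual double-and-add pattern. A standalone toy version over plain unsigned addition, for illustration only:

// Editor's toy, not from the repo: the double-and-add loop the Dummy types
// are built to exercise, with addition mod 2^32 standing in for the group op.
#include <cstdio>

struct Scalar {
unsigned x;
unsigned bit(unsigned i) const { return (x >> i) & 1u; } // 1-bit digits
};

struct Point {
unsigned v; // toy group element
friend Point operator+(Point a, Point b) { return {a.v + b.v}; }
};

Point operator*(Scalar s, Point p) {
Point acc{0};
for (int i = 31; i >= 0; --i) {
acc = acc + acc;             // double
if (s.bit(i)) acc = acc + p; // add
}
return acc;
}

int main() {
Point p{3};
Scalar s{5};
std::printf("%u\n", (s * p).v); // prints 15, i.e. 5 * 3
}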

icicle/src/ntt/CMakeLists.txt (new file, 34 lines)
@@ -0,0 +1,34 @@
set(TARGET icicle_ntt)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)

set(SRC ../)

set(NTT_SOURCE ${SRC}/ntt/extern.cpp)
set(NTT_SOURCE_EXTENSION ${SRC}/ntt/extern_extension.cpp)
set(NTT_SOURCE_EC ${SRC}/ntt/extern_ecntt.cpp)
if(G2)
list(APPEND NTT_SOURCE ${SRC}/ntt/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()

add_library(${TARGET} STATIC ${NTT_SOURCE} ${NTT_SOURCE_EXTENSION} ${NTT_SOURCE_EC})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

@@ -1,13 +1,21 @@
build_verification:
mkdir -p work
nvcc -o work/test_verification -I. -I../../include tests/verification.cu -std=c++17
g++ -o work/test_verification -Intt.cpp -Ikernel_ntt.cpp -Iextern.cpp -Iextern_ecntt.cpp -Iextern_extension.cpp -I../../include tests/verification.cpp -std=c++17

build_extern:
g++ -o work/test_verification -I../../include extern.cpp -std=c++17


extern.o: extern.cpp
g++ -std=c++17 -c -o $@ $< -I../../include


test_verification: build_verification
work/test_verification

build_verification_ecntt:
mkdir -p work
nvcc -o work/test_verification_ecntt -I. -I../../include tests/verification.cu -std=c++17 -DECNTT
g++ -o work/test_verification_ecntt -I. -I../../include tests/verification.cpp -std=c++17 -DECNTT

test_verification_ecntt: build_verification_ecntt
work/test_verification_ecntt

icicle/src/ntt/extern.cpp (new file, 60 lines)
@@ -0,0 +1,60 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"

using namespace field_config;

#include "ntt.cpp"

#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"

typedef int cudaError_t;
namespace ntt {
/**
* Extern "C" version of [init_domain](@ref init_domain) function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, initialize_domain)(
scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return 0;
}

/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, ntt_cuda)(
const scalar_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, scalar_t* output)
{
return ntt<scalar_t, scalar_t>(input, size, dir, config, output);
}

/**
* Extern "C" version of [release_domain](@ref release_domain) function with the following values of template
* parameters (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, release_domain)(device_context::DeviceContext& ctx)
{
return release_domain<scalar_t>(ctx);
}

/**
* Extern "C" version of [get_root_of_unity](@ref get_root_of_unity) function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
*/
extern "C" scalar_t CONCAT_EXPAND(FIELD, get_root_of_unity)(uint32_t logn)
{
return get_root_of_unity<scalar_t>(logn);
}
} // namespace ntt

icicle/src/ntt/extern_ecntt.cpp (new file, 28 lines)
@@ -0,0 +1,28 @@

#define FIELD_ID BN254
#define CURVE_ID BN254
#include "../../include/curves/curve_config.cuh"
#include "../../include/fields/field_config.cuh"

using namespace curve_config;
using namespace field_config;

#include "ntt.cpp"

#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"

namespace ntt {
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `E` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ecntt_cuda)(
const projective_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, projective_t* output)
{
return ntt<scalar_t, projective_t>(input, size, dir, config, output);
}
} // namespace ntt

icicle/src/ntt/extern_extension.cpp (new file, 24 lines)
@@ -0,0 +1,24 @@
#define FIELD_ID BABY_BEAR
#include "../../include/fields/field_config.cuh"

using namespace field_config;

#include "ntt.cpp"

#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"

namespace ntt {
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t);
* - `E` is the [extension](@ref extension_t) of `S` of appropriate degree;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_ntt_cuda)(
const extension_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, extension_t* output)
{
return ntt<scalar_t, extension_t>(input, size, dir, config, output);
}
} // namespace ntt

icicle/src/ntt/kernel_ntt.cpp (new file, 1070 lines)
File diff suppressed because it is too large

@@ -8,7 +8,7 @@ using namespace field_config;

namespace mxntt {

static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
static inline uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
{
uint32_t rev_num = 0, temp, dig_len;
if (dit) {
@@ -31,11 +31,11 @@ namespace mxntt {
return rev_num;
}

static inline __device__ uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }
static inline uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }

enum eRevType { None, RevToMixedRev, MixedRevToRev, NaturalToMixedRev, NaturalToRev, MixedRevToNatural };

static __device__ uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
static uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
{
switch (rev_type) {
case eRevType::RevToMixedRev:

icicle/src/ntt/ntt.cpp (new file, 338 lines)
@@ -0,0 +1,338 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"

using namespace field_config;

#include "../../include/ntt/ntt.cuh"

#include <unordered_map>
#include <vector>
#include <type_traits>
#include <cmath>   // std::ceil, std::log2 (used by get_root_of_unity)
#include <sstream> // std::ostringstream (used by get_root_of_unity_from_domain)

#include "../../include/gpu-utils/sharedmem.cuh"
#include "../../include/utils/utils_kernels.cuh"
#include "../../include/utils/utils.h"
#include "../../include/ntt/ntt_impl.cuh"
#include "../../include/gpu-utils/device_context.cuh"

#include <mutex>

#ifdef CURVE_ID
#include "../../include/curves/curve_config.cuh"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif

namespace ntt {

namespace {
// TODO: Set MAX THREADS based on GPU arch
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512;
const uint32_t MAX_THREADS_BATCH_ECNTT =
128; // TODO: hardcoded - allows (2^18 x 64) ECNTT for sm86, decrease this to allow larger ecntt length, batch
// size limited by on-device memory
const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * MAX_NUM_THREADS;

template <typename E>
void reverse_order_kernel(const E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
{
return;
}

/**
* Bit-reverses a batch of input arrays out-of-place inside GPU.
* for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in batch of arrays of some object of type T. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param batch_size the size of the batch.
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order_batch(
const E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
{
return;
}

/**
* Bit-reverses an input array out-of-place inside GPU.
* for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in array of some object of type T of size which is a power of 2. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order(const E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
{
reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
}
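
Although the CUDA kernels are stubbed out in this backend-model branch, the permutation they stand in for is easy to check on the host. A self-contained sketch matching the corrected example above (indices 1 and 2 swap for n = 4):

// Editor's sketch, not part of the original file: host-side bit-reverse
// permutation; no GPU needed.
#include <cstdint>
#include <cstdio>

static uint32_t bit_rev(uint32_t i, uint32_t logn) {
uint32_t r = 0;
for (uint32_t b = 0; b < logn; ++b)
r |= ((i >> b) & 1u) << (logn - 1 - b); // mirror the low logn bits
return r;
}

int main() {
const uint32_t logn = 2, n = 1u << logn;
int in[4] = {10, 11, 12, 13}, out[4];
for (uint32_t i = 0; i < n; ++i) out[bit_rev(i, logn)] = in[i];
for (uint32_t i = 0; i < n; ++i) std::printf("%d ", out[i]); // 10 12 11 13
std::printf("\n");
}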

/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
void ntt_template_kernel_shared_rev(
const E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int ss,
int logn,
E* __restrict__ arr_out)
{
return;
}

/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
void ntt_template_kernel_shared(
const E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int s,
int logn,
E* __restrict__ arr_out)
{
return;
}

/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
*/
template <typename E, typename S>
void
ntt_template_kernel(const E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
{
return;
}

/**
* NTT/INTT inplace batch
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
* @param d_input Input array
* @param n Size of `d_input`
* @param d_twiddles Twiddles
* @param n_twiddles Size of `d_twiddles`
* @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
* @param inverse true for iNTT
* @param coset should be array of length n or a nullptr if NTT is not computed on a coset
* @param stream CUDA stream
* @param is_async if false, perform sync of the supplied CUDA stream at the end of processing
* @param d_output Output array
*/
template <typename E, typename S>
cudaError_t ntt_inplace_batch_template(
const E* d_input,
int n,
S* d_twiddles,
int n_twiddles,
int batch_size,
int logn,
bool inverse,
bool dit,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t stream,
E* d_output)
{
return 0;
}
} // namespace

/**
* @struct Domain
* Struct containing information about the domain on which (i)NTT is evaluated i.e. twiddle factors.
* Twiddle factors are private, static and can only be set using [init_domain](@ref init_domain) function.
* The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
* @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field.
*/
template <typename S>
class Domain
{
// Mutex for protecting access to the domain/device container array
static inline std::mutex device_domain_mutex;
// The domain-per-device container - assumption is init_domain is called once per device per program.

int max_size = 0;
int max_log_size = 0;
S* twiddles = nullptr;
bool initialized = false; // protection for multi-threaded case
std::unordered_map<S, int> coset_index = {};

S* internal_twiddles = nullptr; // required by mixed-radix NTT
S* basic_twiddles = nullptr; // required by mixed-radix NTT

// mixed-radix NTT supports a fast-twiddle option at the cost of additional 4N memory (where N is max NTT size)
S* fast_external_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_external_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)

public:
template <typename U>
friend cudaError_t init_domain(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);

template <typename U>
friend cudaError_t release_domain(device_context::DeviceContext& ctx);

template <typename U>
friend U get_root_of_unity(uint64_t logn, device_context::DeviceContext& ctx);

template <typename U>
friend U get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);

template <typename U, typename E>
friend cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
};

template <typename S>
// static inline Domain<S> domains_for_devices[device_context::MAX_DEVICES] = {};
static inline Domain<S> domains_for_devices[1] = {};

template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return 0;
}

template <typename S>
cudaError_t release_domain(device_context::DeviceContext& ctx)
{
return 0;
}

template <typename S>
S get_root_of_unity(uint64_t max_size)
{
// ceil up
const auto log_max_size = static_cast<uint32_t>(std::ceil(std::log2(max_size)));
return S::omega(log_max_size);
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity(uint64_t logn);

template <typename S>
S get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx)
{
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
if (logn > domain.max_log_size) {
std::ostringstream oss;
oss << "NTT log_size=" << logn
<< " is too large for the domain. Consider generating your domain with a higher order root of unity.\n";
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
}
const size_t twiddles_idx = 1ULL << (domain.max_log_size - logn);
return domain.twiddles[twiddles_idx];
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
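
The twiddle-index formula above relies on W^(2^(M-logn)) having order 2^logn when the domain generator W has order 2^M. A toy check in Z_17, whose multiplicative group has order 16 with generator 3, mirroring a domain with max_log_size = 4 (numbers hand-verified):

// Editor's toy check, not from the repo.
#include <cstdio>

int main() {
const int p = 17, max_log_size = 4;
int twiddles[17];
twiddles[0] = 1;
for (int i = 1; i <= 16; ++i) twiddles[i] = (twiddles[i - 1] * 3) % p; // 3^i
int logn = 2;                         // request a 2^2 = 4-th root of unity
int idx = 1 << (max_log_size - logn); // twiddles_idx = 4
int w = twiddles[idx];                // 3^4 mod 17 = 13
int w2 = (w * w) % p;
std::printf("w=%d w^2=%d w^4=%d\n", w, w2, (w2 * w2) % p); // 13, 16, 1
}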

template <typename S>
static bool is_choosing_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
{
const bool is_mixed_radix_alg_supported = (logn > 3 && logn != 7);
if (!is_mixed_radix_alg_supported && config.columns_batch)
throw IcicleError(IcicleError_t::InvalidArgument, "columns batch is not supported for given NTT size");
const bool is_user_selected_radix2_alg = config.ntt_algorithm == NttAlgorithm::Radix2;
const bool is_force_radix2 = !is_mixed_radix_alg_supported || is_user_selected_radix2_alg;
if (is_force_radix2) return true;

const bool is_user_selected_mixed_radix_alg = config.ntt_algorithm == NttAlgorithm::MixedRadix;
if (is_user_selected_mixed_radix_alg) return false;
if (config.columns_batch) return false; // radix2 does not currently support columns batch mode.

// Heuristic to automatically select an algorithm
// Note that generally the decision depends on {logn, batch, ordering, inverse, coset, in-place, coeff-field} and
// the specific GPU.
// the following heuristic is a simplification based on measurements. Users can try both and select the algorithm
// based on the specific case via the 'NTTConfig.ntt_algorithm' field

if (logn >= 16) return false; // mixed-radix is typically faster in those cases
if (logn <= 11) return true; // radix-2 is typically faster for batch<=256 in those cases
const int log_batch = (int)log2(batch_size);
return (logn + log_batch <= 18); // almost the cutoff point where both are equal
}

template <typename S, typename E>
cudaError_t radix2_ntt(
const E* d_input,
E* d_output,
S* twiddles,
int ntt_size,
int max_size,
int batch_size,
bool is_inverse,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
{
return 0;
}

template <typename S, typename E>
cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
{
return 0;
}

template <typename S>
NTTConfig<S> default_ntt_config(const device_context::DeviceContext& ctx)
{
NTTConfig<S> config = {
ctx,                // ctx
S::one(),           // coset_gen
1,                  // batch_size
false,              // columns_batch
Ordering::kNN,      // ordering
false,              // are_inputs_on_device
false,              // are_outputs_on_device
false,              // is_async
NttAlgorithm::Auto, // ntt_algorithm
};
return config;
}
// explicit instantiation to avoid having to include this file
template NTTConfig<scalar_t> default_ntt_config(const device_context::DeviceContext& ctx);
} // namespace ntt

icicle/src/ntt/tests/verification.cpp (new file, 46 lines)
@@ -0,0 +1,46 @@
#include "../../../include/fields/id.h"
#define FIELD_ID BN254

#ifdef ECNTT
#define CURVE_ID BN254
#include "../../../include/curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "../../../include/fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif

#include "../../../include/fields/field.cuh"
#include "../../../include/curves/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>

#include "../ntt.cpp"
// #include "../kernel_ntt.cpp"
#include <memory>

void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}

void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}

void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
return;
}

int main(int argc, char** argv)
{
return 0;
}

@@ -1,207 +0,0 @@
#include "fields/id.h"
#define FIELD_ID BN254

#ifdef ECNTT
#define CURVE_ID BN254
#include "curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>

#include "ntt.cu"
#include "kernel_ntt.cu"
#include <memory>

void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}

void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}

__global__ void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
}
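
The kernel above lays a row-major (batch x n) buffer out column-major for columns_batch mode; the same index arithmetic can be exercised on the host. A standalone sketch:

// Editor's sketch, not from the repo: host replay of the transpose indexing.
#include <cstdio>

int main() {
const int row_size = 4, column_size = 2; // n = 4, batch = 2
int in[8] = {0, 1, 2, 3, 4, 5, 6, 7}, out[8];
for (int tid = 0; tid < row_size * column_size; ++tid)
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
for (int i = 0; i < 8; ++i) std::printf("%d ", out[i]); // 0 4 1 5 2 6 3 7
std::printf("\n");
}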

int main(int argc, char** argv)
{
cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
float icicle_time, new_time;

int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
int INV = (argc > 3) ? atoi(argv[3]) : false;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 150;
bool COLUMNS_BATCH = (argc > 5) ? atoi(argv[5]) : false;
int COSET_IDX = (argc > 6) ? atoi(argv[6]) : 2;
const ntt::Ordering ordering = (argc > 7) ? ntt::Ordering(atoi(argv[7])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 8) ? atoi(argv[8]) : true;

// Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
const char* ordering_str = ordering == ntt::Ordering::kNN ? "NN"
: ordering == ntt::Ordering::kNR ? "NR"
: ordering == ntt::Ordering::kRN ? "RN"
: ordering == ntt::Ordering::kRR ? "RR"
: ordering == ntt::Ordering::kNM ? "NM"
: "MN";

printf(
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, columns_batch=%d coset-idx=%d, ordering=%s, fast_tw=%d\n",
NTT_LOG_SIZE, INPLACE, INV, BATCH_SIZE, COLUMNS_BATCH, COSET_IDX, ordering_str, FAST_TW);

CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)

// init domain
auto ntt_config = ntt::default_ntt_config<test_scalar>();
ntt_config.ordering = ordering;
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.batch_size = BATCH_SIZE;
ntt_config.columns_batch = COLUMNS_BATCH;

CHK_IF_RETURN(cudaEventCreate(&icicle_start));
CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
CHK_IF_RETURN(cudaEventCreate(&new_start));
CHK_IF_RETURN(cudaEventCreate(&new_stop));

auto start = std::chrono::high_resolution_clock::now();
const scalar_t basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx, FAST_TW);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << "initDomain took: " << duration / 1000 << " MS" << std::endl;

// cpu allocation
auto CpuScalars = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputOld = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputNew = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);

// gpu allocation
scalar_t *GpuScalars, *GpuOutputOld, *GpuOutputNew;
scalar_t* GpuScalarsTransposed;
CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuScalarsTransposed, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));

// init inputs
// incremental_values(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
random_samples(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
CHK_IF_RETURN(
cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));

if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuScalars, GpuScalarsTransposed, NTT_SIZE, BATCH_SIZE);
}

// inplace
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
}

for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
ntt_config.coset_gen = ntt_config.coset_gen * basic_root;
}

auto benchmark = [&](bool is_print, int iterations) -> cudaError_t {
// NEW
CHK_IF_RETURN(cudaEventRecord(new_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(ntt::ntt(
INPLACE ? GpuOutputNew
: COLUMNS_BATCH ? GpuScalarsTransposed
: GpuScalars,
NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputNew));
}
CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));

// OLD
CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::Radix2;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(
ntt::ntt(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
}
CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));

if (is_print) {
printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
printf("New Runtime=%0.3f MS\n", new_time / iterations);
}

return CHK_LAST();
};

CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
int count = INPLACE ? 1 : 10;
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
}
CHK_IF_RETURN(benchmark(true /*=print*/, count));

if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuOutputNew, GpuScalarsTransposed, BATCH_SIZE, NTT_SIZE);
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, GpuScalarsTransposed, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}

// verify
CHK_IF_RETURN(
cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
CHK_IF_RETURN(
cudaMemcpy(CpuOutputOld.get(), GpuOutputOld, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));

bool success = true;
for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
// if (i%64==0) printf("\n");
if (CpuOutputNew[i] != CpuOutputOld[i]) {
success = false;
// std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
// break;
} else {
// std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
// break;
}
}
const char* success_str = success ? "SUCCESS!" : "FAIL!";
printf("%s\n", success_str);

CHK_IF_RETURN(cudaFree(GpuScalars));
CHK_IF_RETURN(cudaFree(GpuOutputOld));
CHK_IF_RETURN(cudaFree(GpuOutputNew));

ntt::release_domain<test_scalar>(ntt_config.ctx);

return CHK_LAST();
}

@@ -1,721 +0,0 @@
#ifndef T_NTT
#define T_NTT
#pragma once

#include <stdio.h>
#include <stdint.h>
#include "gpu-utils/modifiers.cuh"

struct stage_metadata {
uint32_t th_stride;
uint32_t ntt_block_size;
uint32_t batch_id;
uint32_t ntt_block_id;
uint32_t ntt_inp_id;
};

#define STAGE_SIZES_DATA \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
__device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
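
Each nonzero row i of STAGE_SIZES_DATA decomposes an NTT of log-size i into radix-2^s stages whose sizes sum to i (all-zero rows mark sizes this table does not handle). A self-contained spot-check with a few rows copied from the table:

// Editor's sketch, not part of the original file.
#include <cstdio>

int main() {
const unsigned rows[4][6] = { // {logn, s0..s4}, copied from STAGE_SIZES_DATA
{13, 4, 5, 4, 0, 0},
{19, 6, 5, 4, 4, 0},
{26, 6, 5, 4, 5, 6},
{30, 6, 6, 6, 6, 6},
};
for (auto& r : rows) {
unsigned sum = r[1] + r[2] + r[3] + r[4] + r[5];
std::printf("logn=%u sum=%u %s\n", r[0], sum, sum == r[0] ? "ok" : "MISMATCH");
}
}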

// construction for fast-twiddles
uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 6, 6, 9, 9, 10,
11, 11, 12, 15, 15, 16, 16, 18, 18, 20, 21, 21, 22, 23, 24};

#define STAGE_SIZES_DATA_FAST_TW \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
__device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;

template <typename E, typename S>
class NTTEngine
{
public:
E X[8];
S WB[3];
S WI[7];
S WE[8];

DEVICE_INLINE void loadBasicTwiddles(S* basic_twiddles)
{
UNROLL
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[i];
}
}

DEVICE_INLINE void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
{
UNROLL
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[inv ? i + 3 : i];
}
}

DEVICE_INLINE void loadInternalTwiddles64(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
}
}

DEVICE_INLINE void loadInternalTwiddles32(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
}
}

DEVICE_INLINE void loadInternalTwiddles16(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
}
}

DEVICE_INLINE void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
}
}

DEVICE_INLINE void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}

DEVICE_INLINE void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
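
The `(inv && exp) ? 64 - exp : exp` indexing reuses the forward twiddle table for inverse NTTs via W^-e = W^(N-e) for a root W of order N (with exp = 0 kept as-is, since W^0 is its own inverse). A toy check in Z_17 with N = 16 instead of the 64 used above:

// Editor's toy, not from the repo: W = 3 has order 16 in Z_17.
#include <cstdio>

int main() {
const int p = 17, N = 16;
int tw[16];
tw[0] = 1;
for (int i = 1; i < N; ++i) tw[i] = (tw[i - 1] * 3) % p; // forward table W^i
int exp = 5;
// tw[N - exp] acts as W^-exp: the product with tw[exp] is 1 mod 17
std::printf("%d\n", (tw[exp] * tw[N - exp]) % p); // prints 1
}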

DEVICE_INLINE void loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));

UNROLL
for (uint32_t i = 0; i < 8; i++) {
WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
}
}

DEVICE_INLINE void loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
}
}
}

DEVICE_INLINE void loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
}
}
}

DEVICE_INLINE void loadExternalTwiddlesGeneric64(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t i = 0; i < 8; i++) {
uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 6);
WE[i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}

DEVICE_INLINE void loadExternalTwiddlesGeneric32(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 5);
WE[4 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}

DEVICE_INLINE void loadExternalTwiddlesGeneric16(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 4);
WE[2 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}

DEVICE_INLINE void
loadGlobalData(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}

UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride];
}
}

DEVICE_INLINE void loadGlobalDataColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
}
}

DEVICE_INLINE void
storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}

UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride] = X[i];
}
}

DEVICE_INLINE void storeGlobalDataColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
}
}

DEVICE_INLINE void
loadGlobalData32(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride];
}
}
}

DEVICE_INLINE void loadGlobalData32ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}

DEVICE_INLINE void
storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride] = X[4 * j + i];
}
}
}

DEVICE_INLINE void storeGlobalData32ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
}
}
}

DEVICE_INLINE void
loadGlobalData16(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride];
}
}
}

DEVICE_INLINE void loadGlobalData16ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}

DEVICE_INLINE void
storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride] = X[2 * j + i];
}
}
}

DEVICE_INLINE void storeGlobalData16ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
}
}
}

DEVICE_INLINE void ntt4_2()
{
UNROLL
for (int i = 0; i < 2; i++) {
ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
}
}

DEVICE_INLINE void ntt2_4()
{
UNROLL
for (int i = 0; i < 4; i++) {
ntt2(X[2 * i], X[2 * i + 1]);
}
}

DEVICE_INLINE void ntt2(E& X0, E& X1)
{
E T;

T = X0 + X1;
X1 = X0 - X1;
X0 = T;
}

DEVICE_INLINE void ntt4(E& X0, E& X1, E& X2, E& X3)
{
E T;

T = X0 + X2;
X2 = X0 - X2;
X0 = X1 + X3;
X1 = X1 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3

X1 = X1 * WB[0];

X3 = X2 - X1;
X1 = X2 + X1;
X2 = T - X0;
X0 = T + X0;
}
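
With WB[0] holding a fourth root of unity w (so w^2 = -1), `ntt4` computes the size-4 DFT with outputs in natural order. A standalone check against a naive DFT in Z_17, where w = 13 (13^2 = 16 = -1 mod 17); this is an editor's sketch, not repo code:

#include <cstdio>

const int P = 17, W = 13;
int addm(int a, int b) { return (a + b) % P; }
int subm(int a, int b) { return (a - b + P) % P; }
int mulm(int a, int b) { return (a * b) % P; }

void ntt4(int& X0, int& X1, int& X2, int& X3) { // same schedule as above
int T = addm(X0, X2);
X2 = subm(X0, X2);
X0 = addm(X1, X3);
X1 = subm(X1, X3);
X1 = mulm(X1, W); // WB[0]
X3 = subm(X2, X1);
X1 = addm(X2, X1);
X2 = subm(T, X0);
X0 = addm(T, X0);
}

int main() {
int x[4] = {1, 2, 3, 4}, y[4] = {1, 2, 3, 4};
ntt4(y[0], y[1], y[2], y[3]);
int wp[4] = {1, W, mulm(W, W), mulm(mulm(W, W), W)}; // W^0..W^3
for (int k = 0; k < 4; ++k) {
int naive = 0;
for (int j = 0; j < 4; ++j) naive = addm(naive, mulm(x[j], wp[(j * k) % 4]));
std::printf("k=%d butterfly=%d naive=%d\n", k, y[k], naive); // values match
}
}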
|
||||
// rbo version
|
||||
DEVICE_INLINE void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
|
||||
{
|
||||
E T;
|
||||
|
||||
T = X0 - X1;
|
||||
X0 = X0 + X1;
|
||||
X1 = X2 + X3;
|
||||
X3 = X2 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
|
||||
|
||||
X3 = X3 * WB[0];
|
||||
|
||||
X2 = X0 - X1;
|
||||
X0 = X0 + X1;
|
||||
X1 = T + X3;
|
||||
X3 = T - X3;
|
||||
}
|
||||
|
||||
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
|
||||
{
|
||||
E T;
|
||||
|
||||
// out of 56,623,104 possible mappings, we have:
|
||||
T = X3 - X7;
|
||||
X7 = X3 + X7;
|
||||
X3 = X1 - X5;
|
||||
X5 = X1 + X5;
|
||||
X1 = X2 + X6;
|
||||
X2 = X2 - X6;
|
||||
X6 = X0 + X4;
|
||||
X0 = X0 - X4;
|
||||
|
||||
T = T * WB[1];
|
||||
X2 = X2 * WB[1];
|
||||
|
||||
X4 = X6 + X1;
|
||||
X6 = X6 - X1;
|
||||
X1 = X3 + T;
|
||||
X3 = X3 - T;
|
||||
T = X5 + X7;
|
||||
X5 = X5 - X7;
|
||||
X7 = X0 + X2;
|
||||
X0 = X0 - X2;
|
||||
|
||||
X1 = X1 * WB[0];
|
||||
X5 = X5 * WB[1];
|
||||
X3 = X3 * WB[2];
|
||||
|
||||
X2 = X6 + X5;
|
||||
X6 = X6 - X5;
|
||||
X5 = X7 - X1;
|
||||
X1 = X7 + X1;
|
||||
X7 = X0 - X3;
|
||||
X3 = X0 + X3;
|
||||
X0 = X4 + T;
|
||||
X4 = X4 - T;
|
||||
}
|
||||
|
||||
DEVICE_INLINE void ntt8win()
|
||||
{
|
||||
E T;
|
||||
|
||||
T = X[3] - X[7];
|
||||
X[7] = X[3] + X[7];
|
||||
X[3] = X[1] - X[5];
|
||||
X[5] = X[1] + X[5];
|
||||
X[1] = X[2] + X[6];
|
||||
X[2] = X[2] - X[6];
|
||||
X[6] = X[0] + X[4];
|
||||
X[0] = X[0] - X[4];
|
||||
|
||||
X[2] = X[2] * WB[0];
|
||||
|
||||
X[4] = X[6] + X[1];
|
||||
X[6] = X[6] - X[1];
|
||||
X[1] = X[3] + T;
|
||||
X[3] = X[3] - T;
|
||||
T = X[5] + X[7];
|
||||
X[5] = X[5] - X[7];
|
||||
X[7] = X[0] + X[2];
|
||||
X[0] = X[0] - X[2];
|
||||
|
||||
X[1] = X[1] * WB[1];
|
||||
X[5] = X[5] * WB[0];
|
||||
X[3] = X[3] * WB[2];
|
||||
|
||||
X[2] = X[6] + X[5];
|
||||
X[6] = X[6] - X[5];
|
||||
|
||||
X[5] = X[1] + X[3];
|
||||
X[3] = X[1] - X[3];
|
||||
|
||||
X[1] = X[7] + X[5];
|
||||
X[5] = X[7] - X[5];
|
||||
X[7] = X[0] - X[3];
|
||||
X[3] = X[0] + X[3];
|
||||
X[0] = X[4] + T;
|
||||
X[4] = X[4] - T;
|
||||
}
|
||||
|
||||
DEVICE_INLINE void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
|
||||
{
|
||||
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
|
||||
uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
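// Shared-memory layout: each NTT owns 64 consecutive shmem slots; `stride`
// picks whether the low or the high bits of threadIdx.x select the NTT
// (ntt_id) versus the lane within it. The Columns variants below walk their
// 8 elements at stride 8, the Rows variants store them contiguously.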

UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + i * 8 + column_id];
}
}
}

DEVICE_INLINE void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + row_id * 8 + i];
}
}
}

DEVICE_INLINE void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + i * 4 + column_id];
}
}
}

DEVICE_INLINE void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + row_id * 8 + i];
}
}
}

DEVICE_INLINE void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + i * 8 + column_id + j];
}
}
}
}

DEVICE_INLINE void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;

UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + row_id * 4 + 4 * j + i];
}
}
}
}

DEVICE_INLINE void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + i * 2 + column_id];
}
}
}

DEVICE_INLINE void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;

UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + row_id * 8 + i];
}
}
}

DEVICE_INLINE void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + i * 8 + column_id + j];
}
}
}
}

DEVICE_INLINE void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;

UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + row_id * 2 + 2 * j + i];
}
}
}
}

DEVICE_INLINE void twiddlesInternal()
{
UNROLL
for (int i = 1; i < 8; i++) {
X[i] = X[i] * WI[i - 1];
}
}

DEVICE_INLINE void twiddlesExternal()
{
UNROLL
for (int i = 0; i < 8; i++) {
X[i] = X[i] * WE[i];
}
}
};

#endif
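A quick host-side sanity check of the ntt4 butterfly above (illustrative only: p = 17 and w = 4, a primitive 4th root of unity mod 17, stand in for the real field element type E and for WB[0]). Replaying the exact add/sub/twiddle sequence reproduces the naive 4-point NTT in natural order:

#include <cassert>
#include <cstdint>

static const uint64_t p = 17, w = 4; // w^4 == 1 (mod p); w plays the role of WB[0]

static uint64_t add(uint64_t a, uint64_t b) { return (a + b) % p; }
static uint64_t sub(uint64_t a, uint64_t b) { return (a + p - b) % p; }
static uint64_t mul(uint64_t a, uint64_t b) { return (a * b) % p; }
static uint64_t pw(uint64_t b, uint64_t e) { uint64_t r = 1; while (e--) r = mul(r, b); return r; }

// Same operation sequence as ntt4(X0, X1, X2, X3) above.
static void ntt4_ref(uint64_t& X0, uint64_t& X1, uint64_t& X2, uint64_t& X3)
{
  uint64_t T = add(X0, X2);
  X2 = sub(X0, X2);
  X0 = add(X1, X3);
  X1 = mul(sub(X1, X3), w); // the WB[0] twiddle
  X3 = sub(X2, X1);
  X1 = add(X2, X1);
  X2 = sub(T, X0);
  X0 = add(T, X0);
}

int main()
{
  uint64_t x[4] = {3, 1, 4, 1}, y[4] = {3, 1, 4, 1};
  ntt4_ref(y[0], y[1], y[2], y[3]);
  for (uint64_t k = 0; k < 4; k++) { // naive NTT: y[k] == sum_j x[j] * w^(j*k)
    uint64_t acc = 0;
    for (uint64_t j = 0; j < 4; j++) acc = add(acc, mul(x[j], pw(w, j * k)));
    assert(acc == y[k]);
  }
  return 0;
}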
27 icicle/src/polynomials/CMakeLists.txt Normal file
@@ -0,0 +1,27 @@
set(TARGET icicle_poly)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)

set(SRC ../)

set(POLY_SOURCE ${SRC}/polynomials/polynomials.cpp)
set(POLY_API_SOURCE ${SRC}/polynomials/polynomials_c_api.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()

# Both sources belong to one target; a second add_library() call with the
# same target name is a CMake configure-time error.
add_library(${TARGET} STATIC ${POLY_SOURCE} ${POLY_API_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)
@@ -39,7 +39,7 @@ namespace polynomials {

/*============================== evaluate ==============================*/
template <typename T>
__device__ T pow(T base, int exp)
T pow(T base, int exp)
{
T result = T::one();
while (exp > 0) {

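The hunk cuts off inside pow; for orientation only, a standard square-and-multiply continuation consistent with the lines above would be (a sketch, not necessarily the file's actual code):

  if (exp & 1) result = result * base; // fold the current bit in
  base = base * base;                  // square for the next bit
  exp >>= 1;
}
return result;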
204 icicle/src/polynomials/polynomials.cpp Normal file
@@ -0,0 +1,204 @@
#define FIELD_ID BN254
#include "../../include/polynomials/polynomials.h"
namespace polynomials {

template <typename C, typename D, typename I>
Polynomial<C, D, I>::Polynomial()
{
if (nullptr == s_factory) {
throw std::runtime_error("Polynomial factory not initialized. Must call Polynomial::initialize(factory)");
}
m_context = s_factory->create_context();
m_backend = s_factory->create_backend();
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::from_coefficients(const C* coefficients, uint64_t nof_coefficients)
{
Polynomial<C, D, I> P = {};
P.m_backend->from_coefficients(P.m_context, nof_coefficients, coefficients);
return P;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::from_rou_evaluations(const I* evaluations, uint64_t nof_evaluations)
{
Polynomial<C, D, I> P = {};
P.m_backend->from_rou_evaluations(P.m_context, nof_evaluations, evaluations);
return P;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::clone() const
{
Polynomial<C, D, I> P = {};
m_backend->clone(P.m_context, m_context);
return P;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::slice(uint64_t offset, uint64_t stride, uint64_t size)
{
Polynomial res = {};
m_backend->slice(res.m_context, this->m_context, offset, stride, size);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::even()
{
return slice(0, 2, 0 /*all elements*/);
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::odd()
{
return slice(1, 2, 0 /*all elements*/);
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator+(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->add(res.m_context, m_context, rhs.m_context);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator-(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->subtract(res.m_context, m_context, rhs.m_context);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator*(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->multiply(res.m_context, m_context, rhs.m_context);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator*(const D& scalar) const
{
Polynomial<C, D, I> res = {};
m_backend->multiply(res.m_context, m_context, scalar);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs)
{
return rhs * scalar;
}

template <typename C, typename D, typename I>
std::pair<Polynomial<C, D, I>, Polynomial<C, D, I>> Polynomial<C, D, I>::divide(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> Q = {}, R = {};
m_backend->divide(Q.m_context, R.m_context, m_context, rhs.m_context);
return std::make_pair(std::move(Q), std::move(R));
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator/(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->quotient(res.m_context, m_context, rhs.m_context);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator%(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->remainder(res.m_context, m_context, rhs.m_context);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::divide_by_vanishing_polynomial(uint64_t vanishing_polynomial_degree) const
{
Polynomial<C, D, I> res = {};
m_backend->divide_by_vanishing_polynomial(res.m_context, m_context, vanishing_polynomial_degree);
return res;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::operator+=(const Polynomial& rhs)
{
m_backend->add(m_context, m_context, rhs.m_context);
return *this;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::add_monomial_inplace(C monomial_coeff, uint64_t monomial)
{
m_backend->add_monomial_inplace(m_context, monomial_coeff, monomial);
return *this;
}

template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::sub_monomial_inplace(C monomial_coeff, uint64_t monomial)
{
m_backend->sub_monomial_inplace(m_context, monomial_coeff, monomial);
return *this;
}

template <typename C, typename D, typename I>
I Polynomial<C, D, I>::operator()(const D& x) const
{
I eval = {};
evaluate(&x, &eval);
return eval;
}

template <typename C, typename D, typename I>
void Polynomial<C, D, I>::evaluate(const D* x, I* eval) const
{
m_backend->evaluate(m_context, x, eval);
}

template <typename C, typename D, typename I>
void Polynomial<C, D, I>::evaluate_on_domain(D* domain, uint64_t size, I* evals /*OUT*/) const
{
return m_backend->evaluate_on_domain(m_context, domain, size, evals);
}

template <typename C, typename D, typename I>
int64_t Polynomial<C, D, I>::degree()
{
return m_backend->degree(m_context);
}

template <typename C, typename D, typename I>
C Polynomial<C, D, I>::get_coeff(uint64_t idx) const
{
return m_backend->get_coeff(m_context, idx);
}

template <typename C, typename D, typename I>
uint64_t Polynomial<C, D, I>::copy_coeffs(C* host_coeffs, uint64_t start_idx, uint64_t end_idx) const
{
return m_backend->copy_coeffs(m_context, host_coeffs, start_idx, end_idx);
}

template <typename C, typename D, typename I>
std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
Polynomial<C, D, I>::get_coefficients_view()
{
return m_backend->get_coefficients_view(m_context);
}

template <typename C, typename D, typename I>
std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
Polynomial<C, D, I>::get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
{
return m_backend->get_rou_evaluations_view(m_context, nof_evaluations, is_reversed);
}

// explicit instantiation for default type (scalar field)
template class Polynomial<scalar_t>;
template Polynomial<scalar_t> operator*(const scalar_t& c, const Polynomial<scalar_t>& rhs);

} // namespace polynomials
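A minimal usage sketch of the facade above (illustrative; it assumes the CUDA backend factory from the C API file that follows and the default template arguments behind Polynomial<scalar_t>):

#include <memory>
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"

using namespace polynomials;

void polynomial_demo()
{
  // Without this, the Polynomial constructor throws "factory not initialized".
  auto factory = std::make_shared<CUDAPolynomialFactory<scalar_t>>();
  Polynomial<scalar_t>::initialize(factory);

  scalar_t coeffs[3] = {scalar_t::one(), scalar_t::one(), scalar_t::one()}; // 1 + x + x^2
  auto f = Polynomial<scalar_t>::from_coefficients(coeffs, 3);

  auto e = f.even();               // slice(0, 2): coefficients of x^0, x^2, ...
  auto o = f.odd();                // slice(1, 2): coefficients of x^1, x^3, ...
  auto [q, r] = (f * f).divide(f); // q == f, r == 0, all computed by the backend
}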
284 icicle/src/polynomials/polynomials_c_api.cpp Normal file
@@ -0,0 +1,284 @@
#define FIELD_ID BN254
#include "../../include/polynomials/polynomials.h"
#include "../../include/fields/field_config.cuh"
#include "../../include/utils/utils.h"
#include "../../include/utils/integrity_pointer.h"
#include "../../include/polynomials/cuda_backend/polynomial_cuda_backend.cuh"

namespace polynomials {
extern "C" {

// Defines a polynomial instance based on the scalar type from the FIELD configuration.
typedef Polynomial<scalar_t> PolynomialInst;

bool CONCAT_EXPAND(FIELD, polynomial_init_cuda_backend)()
{
static auto cuda_factory = std::make_shared<CUDAPolynomialFactory<scalar_t>>();
PolynomialInst::initialize(cuda_factory);
return cuda_factory != nullptr;
}

// Constructs a polynomial from a set of coefficients.
// coeffs: Array of coefficients.
// size: Number of coefficients in the array.
// Returns a pointer to the newly created polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_coefficients)(scalar_t* coeffs, size_t size)
{
auto result = new PolynomialInst(PolynomialInst::from_coefficients(coeffs, size));
return result;
}

// Constructs a polynomial from evaluations at the roots of unity.
// evals: Array of evaluations.
// size: Number of evaluations in the array.
// Returns a pointer to the newly created polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_rou_evaluations)(scalar_t* evals, size_t size)
{
auto result = new PolynomialInst(PolynomialInst::from_rou_evaluations(evals, size));
return result;
}

// Clones an existing polynomial instance.
// p: Pointer to the polynomial instance to clone.
// Returns a pointer to the cloned polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_clone)(const PolynomialInst* p)
{
auto result = new PolynomialInst(p->clone());
return result;
}

// Deletes a polynomial instance, freeing its memory.
// instance: Pointer to the polynomial instance to delete.
void CONCAT_EXPAND(FIELD, polynomial_delete)(PolynomialInst* instance) { delete instance; }

// Prints a polynomial to stdout
void CONCAT_EXPAND(FIELD, polynomial_print)(PolynomialInst* p) { std::cout << *p << std::endl; }

// Adds two polynomials.
// a, b: Pointers to the polynomial instances to add.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_add)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a + *b));
return result;
}

// Adds a polynomial to another in place.
// a: Pointer to the polynomial to add to.
// b: Pointer to the polynomial to add.
void CONCAT_EXPAND(FIELD, polynomial_add_inplace)(PolynomialInst* a, const PolynomialInst* b) { *a += *b; }

// Subtracts one polynomial from another.
// a, b: Pointers to the polynomial instances (minuend and subtrahend, respectively).
// Returns a pointer to the resulting polynomial instance.

PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_subtract)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a - *b));
return result;
}

// Multiplies two polynomials.
// a, b: Pointers to the polynomial instances to multiply.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a * *b));
return result;
}

// Multiplies a polynomial by a scalar.
// a: Pointer to the polynomial instance.
// scalar: Scalar to multiply by.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply_by_scalar)(const PolynomialInst* a, const scalar_t& scalar)
{
auto result = new PolynomialInst(std::move(*a * scalar));
return result;
}

// Divides one polynomial by another, returning both quotient and remainder.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// q: Output parameter for the quotient.
// r: Output parameter for the remainder.
void CONCAT_EXPAND(FIELD, polynomial_division)(
const PolynomialInst* a, const PolynomialInst* b, PolynomialInst** q /*OUT*/, PolynomialInst** r /*OUT*/)
{
auto [_q, _r] = a->divide(*b);
*q = new PolynomialInst(std::move(_q));
*r = new PolynomialInst(std::move(_r));
}

// Calculates the quotient of dividing one polynomial by another.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// Returns a pointer to the resulting quotient polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_quotient)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a / *b));
return result;
}

// Calculates the remainder of dividing one polynomial by another.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// Returns a pointer to the resulting remainder polynomial instance.

PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_remainder)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a % *b));
return result;
}

// Divides a polynomial by a vanishing polynomial of a given degree, over a roots-of-unity domain.
// p: Pointer to the polynomial instance.
// vanishing_poly_degree: Degree of the vanishing polynomial.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst*
CONCAT_EXPAND(FIELD, polynomial_divide_by_vanishing)(const PolynomialInst* p, uint64_t vanishing_poly_degree)
{
auto result = new PolynomialInst(std::move(p->divide_by_vanishing_polynomial(vanishing_poly_degree)));
return result;
}

// Adds a monomial to a polynomial in place.
// p: Pointer to the polynomial instance.
// monomial_coeff: Coefficient of the monomial to add.
// monomial: Degree of the monomial to add.
void CONCAT_EXPAND(FIELD, polynomial_add_monomial_inplace)(
PolynomialInst* p, const scalar_t& monomial_coeff, uint64_t monomial)
{
p->add_monomial_inplace(monomial_coeff, monomial);
}

// Subtracts a monomial from a polynomial in place.
// p: Pointer to the polynomial instance.
// monomial_coeff: Coefficient of the monomial to subtract.
// monomial: Degree of the monomial to subtract.
void CONCAT_EXPAND(FIELD, polynomial_sub_monomial_inplace)(
PolynomialInst* p, const scalar_t& monomial_coeff, uint64_t monomial)
{
p->sub_monomial_inplace(monomial_coeff, monomial);
}

// Creates a new polynomial instance by slicing an existing polynomial.
// p: Pointer to the original polynomial instance to be sliced.
// offset: Starting index for the slice.
// stride: Interval between elements in the slice.
// size: Number of elements in the slice.
// Returns: Pointer to the new polynomial instance containing the slice.
PolynomialInst*
CONCAT_EXPAND(FIELD, polynomial_slice)(PolynomialInst* p, uint64_t offset, uint64_t stride, uint64_t size)
{
auto result = new PolynomialInst(std::move(p->slice(offset, stride, size)));
return result;
}

// Creates a new polynomial instance containing only the even-powered terms of the original polynomial.
// p: Pointer to the original polynomial instance.
// Returns: Pointer to the new polynomial instance containing only even-powered terms.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_even)(PolynomialInst* p)
{
auto result = new PolynomialInst(std::move(p->even()));
return result;
}

// Creates a new polynomial instance containing only the odd-powered terms of the original polynomial.
// p: Pointer to the original polynomial instance.
// Returns: Pointer to the new polynomial instance containing only odd-powered terms.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_odd)(PolynomialInst* p)
{
auto result = new PolynomialInst(std::move(p->odd()));
return result;
}

// Evaluates a polynomial on a domain of points.
// p: Pointer to the polynomial instance.
// domain: Array of points constituting the domain.
// domain_size: Number of points in the domain.
// evals: Output array for the evaluations.
void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_domain)(
const PolynomialInst* p, scalar_t* domain, uint64_t domain_size, scalar_t* evals /*OUT*/)
{
return p->evaluate_on_domain(domain, domain_size, evals);
}

// Returns the degree of a polynomial.
// p: Pointer to the polynomial instance.
// Returns the degree of the polynomial.
int64_t CONCAT_EXPAND(FIELD, polynomial_degree)(PolynomialInst* p) { return p->degree(); }

// Copies a range of polynomial coefficients to host/device memory.
// p: Pointer to the polynomial instance.
// memory: Array to copy the coefficients into. If NULL, nothing is copied.
// start_idx: Start index of the range to copy.
// end_idx: End index of the range to copy.
// Returns the number of coefficients copied. If memory is NULL, returns the total number of coefficients.
uint64_t CONCAT_EXPAND(FIELD, polynomial_copy_coeffs_range)(
PolynomialInst* p, scalar_t* memory, uint64_t start_idx, uint64_t end_idx)
{
return p->copy_coeffs(memory, start_idx, end_idx);
}

// Retrieves a device-memory raw-ptr of the polynomial coefficients.
// p: Pointer to the polynomial instance.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a raw mutable pointer to the coefficients.
scalar_t* CONCAT_EXPAND(FIELD, polynomial_get_coeffs_raw_ptr)(
PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [coeffs, _size, _device_id] = p->get_coefficients_view();
*size = _size;
*device_id = _device_id;
return const_cast<scalar_t*>(coeffs.get());
}

// Retrieves a device-memory view of the polynomial coefficients.
// p: Pointer to the polynomial instance.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a pointer to an integrity pointer encapsulating the coefficients view.
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_coeff_view)(
PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [coeffs, _size, _device_id] = p->get_coefficients_view();
*size = _size;
*device_id = _device_id;
return new IntegrityPointer<scalar_t>(std::move(coeffs));
}

// Retrieves a device-memory view of the polynomial's evaluations on the roots of unity.
// p: Pointer to the polynomial instance.
// nof_evals: Number of evaluations.
// is_reversed: Whether the evaluations are in reversed order.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a pointer to an integrity pointer encapsulating the evaluations view.
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [rou_evals, _size, _device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
*size = _size;
*device_id = _device_id;
return new IntegrityPointer<scalar_t>(std::move(rou_evals));
}

// Reads the pointer from an integrity pointer.
// p: Pointer to the integrity pointer.
// Returns the raw pointer if still valid, otherwise NULL.
const scalar_t* CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_get)(IntegrityPointer<scalar_t>* p)
{
return p->get();
}

// Checks if an integrity pointer is still valid.
// p: Pointer to the integrity pointer.
// Returns true if the pointer is valid, false otherwise.
bool CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_is_valid)(IntegrityPointer<scalar_t>* p) { return p->isValid(); }

// Destroys an integrity pointer, freeing its resources.
// p: Pointer to the integrity pointer to destroy.
void CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_destroy)(IntegrityPointer<scalar_t>* p) { delete p; }

} // extern "C"

} // namespace polynomials
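Seen from a C-style caller the wrappers compose as below. The bn254_ prefix is an assumption about how CONCAT_EXPAND(FIELD, ...) expands in a BN254 build, and the declarations simply mirror the definitions above:

extern "C" {
bool bn254_polynomial_init_cuda_backend();
PolynomialInst* bn254_polynomial_create_from_coefficients(scalar_t* coeffs, size_t size);
PolynomialInst* bn254_polynomial_add(const PolynomialInst* a, const PolynomialInst* b);
int64_t bn254_polynomial_degree(PolynomialInst* p);
void bn254_polynomial_delete(PolynomialInst* instance);
}

void c_api_demo(scalar_t* coeffs, size_t size)
{
  bn254_polynomial_init_cuda_backend();                     // register the CUDA backend once
  auto* f = bn254_polynomial_create_from_coefficients(coeffs, size);
  auto* g = bn254_polynomial_add(f, f);                     // every result is a new heap instance...
  int64_t d = bn254_polynomial_degree(g);
  (void)d;
  bn254_polynomial_delete(g);                               // ...so the caller owns and frees it
  bn254_polynomial_delete(f);
}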
25 icicle/src/poseidon/CMakeLists.txt Normal file
@@ -0,0 +1,25 @@
set(TARGET icicle_poseidon)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)

set(SRC ../)

set(POLY_SOURCE ${SRC}/poseidon/poseidon.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()

add_library(${TARGET} STATIC ${POLY_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)
@@ -1,21 +1,21 @@
#include "poseidon/poseidon.cuh"
#include "../../include/poseidon/poseidon.cuh"

/// These are pre-calculated constants for different curves
#include "fields/id.h"
#include "../../include/fields/id.h"
#if FIELD_ID == BN254
#include "poseidon/constants/bn254_poseidon.h"
#include "../../include/poseidon/constants/bn254_poseidon.h"
using namespace poseidon_constants_bn254;
#elif FIELD_ID == BLS12_381
#include "poseidon/constants/bls12_381_poseidon.h"
#include "../../include/poseidon/constants/bls12_381_poseidon.h"
using namespace poseidon_constants_bls12_381;
#elif FIELD_ID == BLS12_377
#include "poseidon/constants/bls12_377_poseidon.h"
#include "../../include/poseidon/constants/bls12_377_poseidon.h"
using namespace poseidon_constants_bls12_377;
#elif FIELD_ID == BW6_761
#include "poseidon/constants/bw6_761_poseidon.h"
#include "../../include/poseidon/constants/bw6_761_poseidon.h"
using namespace poseidon_constants_bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "poseidon/constants/grumpkin_poseidon.h"
#include "../../include/poseidon/constants/grumpkin_poseidon.h"
using namespace poseidon_constants_grumpkin;
#endif

@@ -29,8 +29,8 @@ namespace poseidon {
device_context::DeviceContext& ctx,
PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = ctx.stream;
// CHK_INIT_IF_RETURN();
int& stream = ctx.stream;
int width = arity + 1;
int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
int mds_matrix_len = width * width;
@@ -39,10 +39,10 @@ namespace poseidon {

// Malloc memory for copying constants
S* d_constants;
CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
// CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));

// Copy constants
CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
// CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));

S* round_constants = d_constants;
S* mds_matrix = round_constants + round_constants_len;
@@ -56,18 +56,18 @@ namespace poseidon {
S domain_tag = S::from(tree_domain_tag_value);

// Make sure all the constants have been copied
CHK_IF_RETURN(cudaStreamSynchronize(stream));
// CHK_IF_RETURN(cudaStreamSynchronize(stream));
*poseidon_constants = {arity, partial_rounds, full_rounds_half, round_constants,
mds_matrix, non_sparse_matrix, sparse_matrices, domain_tag};

return CHK_LAST();
return 0;
}

template <typename S>
cudaError_t init_optimized_poseidon_constants(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
//CHK_INIT_IF_RETURN();
int full_rounds_half = FULL_ROUNDS_DEFAULT;
int partial_rounds;
unsigned char* constants;
@@ -96,7 +96,7 @@ namespace poseidon {

create_optimized_poseidon_constants(arity, full_rounds_half, partial_rounds, h_constants, ctx, poseidon_constants);

return CHK_LAST();
return 0; //CHK_LAST();
}

extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(

@@ -1,38 +1,15 @@
#include "poseidon/poseidon.cuh"
#include "gpu-utils/modifiers.cuh"
#include "../../include/poseidon/poseidon.cuh"
#include "../../include/gpu-utils/modifiers.cuh"

namespace poseidon {
template <typename S, int T>
__global__ void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int state_number = idx / T;
if (state_number >= number_of_states) { return; }
int element_number = idx % T;

S prepared_element;

// Domain separation
if (element_number == 0) {
prepared_element = domain_tag;
} else {
if (aligned) {
prepared_element = states[idx];
} else {
prepared_element = states[idx - 1];
}
}

// We need __syncthreads here if the state is not aligned
// because then we need to shift the vector [A, B, 0] -> [D, A, B]
if (!aligned) { __syncthreads(); }

// Store element in state
states[idx] = prepared_element;
return;
}
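// Worked example (T = 3, aligned = false): each state arrives as [A, B, 0] in
// its T-wide slot, so element 0 writes the domain tag while elements 1 and 2
// read states[idx - 1], turning [A, B, 0] into [tag, A, B]; the barrier above
// makes sure every neighbour value has been read before it is overwritten.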

template <typename S>
DEVICE_INLINE S sbox_alpha_five(S element)
S sbox_alpha_five(S element)
{
S result = S::sqr(element);
result = S::sqr(result);
@@ -40,23 +17,13 @@ namespace poseidon {
}

template <typename S, int T>
__device__ S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
{
__syncthreads();
shared_states[threadIdx.x] = element;
__syncthreads();

typename S::Wide element_wide = S::mul_wide(shared_states[vec_number * T], matrix[element_number]);
UNROLL
for (int i = 1; i < T; i++) {
element_wide = element_wide + S::mul_wide(shared_states[vec_number * T + i], matrix[i * T + element_number]);
}

return S::reduce(element_wide);
return element;
}

template <typename S, int T>
__device__ S full_round(
S full_round(
S element,
size_t rc_offset,
int local_state_number,
@@ -67,110 +34,40 @@ namespace poseidon {
S* shared_states,
const PoseidonConstants<S>& constants)
{
if (add_pre_round_constants) {
element = element + constants.round_constants[rc_offset + element_number];
rc_offset += T;
}
element = sbox_alpha_five(element);
if (!skip_rc) { element = element + constants.round_constants[rc_offset + element_number]; }

// Multiply all the states by mds matrix
S* matrix = multiply_by_mds ? constants.mds_matrix : constants.non_sparse_matrix;
return vecs_mul_matrix<S, T>(element, matrix, element_number, local_state_number, shared_states);
return element;
}

template <typename S, int T>
__global__ void full_rounds(
void full_rounds(
S* states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConstants<S> constants)
{
extern __shared__ S shared_states[];

int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
int state_number = idx / T;
if (state_number >= number_of_states) { return; }
int local_state_number = threadIdx.x / T;
int element_number = idx % T;

S new_el = states[idx];
bool add_pre_round_constants = first_half;
for (int i = 0; i < constants.full_rounds_half; i++) {
new_el = full_round<S, T>(
new_el, rc_offset, local_state_number, element_number, !first_half || (i < (constants.full_rounds_half - 1)),
add_pre_round_constants, !first_half && (i == constants.full_rounds_half - 1), shared_states, constants);
rc_offset += T;

if (add_pre_round_constants) {
rc_offset += T;
add_pre_round_constants = false;
}
}
states[idx] = new_el;
return;
}

template <typename S, int T>
__device__ S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
{
S element = state[0];
element = sbox_alpha_five(element);
element = element + constants.round_constants[rc_offset];

S* sparse_matrix = &constants.sparse_matrices[(T * 2 - 1) * round_number];

typename S::Wide state_0_wide = S::mul_wide(element, sparse_matrix[0]);

UNROLL
for (int i = 1; i < T; i++) {
state_0_wide = state_0_wide + S::mul_wide(state[i], sparse_matrix[i]);
}

state[0] = S::reduce(state_0_wide);

UNROLL
for (int i = 1; i < T; i++) {
state[i] = state[i] + (element * sparse_matrix[T + i - 1]);
}
return element;
}

template <typename S, int T>
__global__ void
void
partial_rounds(S* states, size_t number_of_states, size_t rc_offset, const PoseidonConstants<S> constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }

S state[T];
UNROLL
for (int i = 0; i < T; i++) {
state[i] = states[idx * T + i];
}

for (int i = 0; i < constants.partial_rounds; i++) {
partial_round<S, T>(state, rc_offset, i, constants);
rc_offset++;
}

UNROLL
for (int i = 0; i < T; i++) {
states[idx * T + i] = state[i];
}
return;
}

// This function just copies the hash results from the states to the output
template <typename S, int T>
__global__ void get_hash_results(S* states, size_t number_of_states, S* out)
void get_hash_results(S* states, size_t number_of_states, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }

out[idx] = states[idx * T + 1];
return;
}

template <typename S, int T>
__global__ void copy_recursive(S* state, size_t number_of_states, S* out)
void copy_recursive(S* state, size_t number_of_states, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }

state[(idx / (T - 1) * T) + (idx % (T - 1)) + 1] = out[idx];
return;
}
} // namespace poseidon
47 icicle/src/poseidon/poseidon.cpp Normal file
@@ -0,0 +1,47 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"

using namespace field_config;

#include "../../include/poseidon/poseidon.cuh"
#include "constants.cu"
#include "kernels.cu"
typedef int cudaError_t;
namespace poseidon {
template <typename S, int T>
cudaError_t
permute_many(S* states, size_t number_of_states, const PoseidonConstants<S>& constants, int& stream)
{
return 0;
}

template <typename S, int T>
cudaError_t poseidon_hash(
S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config)
{
return 0;
}

extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
scalar_t* input,
scalar_t* output,
int number_of_states,
int arity,
const PoseidonConstants<scalar_t>& constants,
PoseidonConfig& config)
{
switch (arity) {
case 2:
return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
case 4:
return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
case 8:
return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
case 11:
return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
default:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
}
return 0;
}
} // namespace poseidon
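The switch above fixes the Poseidon state width at T = arity + 1, the extra slot holding the domain tag. A host-side sketch for the BN254 build (the bn254_ prefix is assumed from CONCAT_EXPAND(FIELD, ...), and on this branch the backend is a stub that returns 0):

extern "C" cudaError_t bn254_poseidon_hash_cuda(
  scalar_t* input, scalar_t* output, int number_of_states, int arity,
  const poseidon::PoseidonConstants<scalar_t>& constants, poseidon::PoseidonConfig& config);

cudaError_t hash_leaf_pairs(
  scalar_t* leaves, scalar_t* digests, int n,
  const poseidon::PoseidonConstants<scalar_t>& constants, poseidon::PoseidonConfig& config)
{
  // arity 2 dispatches to poseidon_hash<scalar_t, 3>: two inputs plus one tag slot.
  return bn254_poseidon_hash_cuda(leaves, digests, n, 2, constants, config);
}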
27 icicle/src/vec_ops/CMakeLists.txt Normal file
@@ -0,0 +1,27 @@
set(TARGET icicle_vec_ops)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)

set(SRC ../)

set(VEC_OPS_SOURCE ${SRC}/vec_ops/extern.cpp)
set(VEC_OPS_EXT_SOURCE ${SRC}/vec_ops/extern_extension.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()

# Both sources belong to one target; repeating add_library() with the same
# target name fails at configure time.
add_library(${TARGET} STATIC ${VEC_OPS_SOURCE} ${VEC_OPS_EXT_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)
63 icicle/src/vec_ops/extern.cpp Normal file
@@ -0,0 +1,63 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"

using namespace field_config;

#include "../../include/utils/utils.h"
#include "vec_ops.cu"

namespace vec_ops {
/**
 * Extern version of [Mul](@ref Mul) function with the template parameters
 * `S` and `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
 * or standalone "STARK field" given by `-DFIELD`).
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return mul<scalar_t>(vec_a, vec_b, n, config, result);
}

/**
 * Extern version of [Add](@ref Add) function with the template parameter
 * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
 * or standalone "STARK field" given by `-DFIELD`).
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return add<scalar_t>(vec_a, vec_b, n, config, result);
}

/**
 * Extern version of [Sub](@ref Sub) function with the template parameter
 * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
 * or standalone "STARK field" given by `-DFIELD`).
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return sub<scalar_t>(vec_a, vec_b, n, config, result);
}

/**
 * Extern version of transpose_batch function with the template parameter
 * `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
 * or standalone "STARK field" given by `-DFIELD`).
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, transpose_matrix_cuda)(
const scalar_t* input,
uint32_t row_size,
uint32_t column_size,
scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops
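A host-side sketch of calling the exported multiply (the bn254_ prefix is an assumption about the CONCAT_EXPAND(FIELD, ...) expansion in a BN254 build; the flags are the ones vec_op() in vec_ops.cu checks):

extern "C" cudaError_t bn254_mul_cuda(
  scalar_t* vec_a, scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, scalar_t* result);

cudaError_t elementwise_product(scalar_t* a, scalar_t* b, scalar_t* out, int n)
{
  vec_ops::VecOpsConfig config = {};  // default stream in config.ctx
  config.is_a_on_device = false;      // host inputs: vec_op() stages them on the device
  config.is_b_on_device = false;
  config.is_result_on_device = false; // result is copied back to the host
  config.is_async = false;            // vec_op() synchronizes the stream before returning
  return bn254_mul_cuda(a, b, n, config, out);
}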
60 icicle/src/vec_ops/extern_extension.cpp Normal file
@@ -0,0 +1,60 @@
#define FIELD_ID BABY_BEAR
#include "../../include/fields/field_config.cuh"

using namespace field_config;

#include "../../include/utils/utils.h"
#include "vec_ops.cu"

namespace vec_ops {
/**
 * Extern version of [Mul](@ref Mul) function with the template parameters
 * `S` and `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable
 * during build.
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_mul_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return mul<extension_t>(vec_a, vec_b, n, config, result);
}

/**
 * Extern version of [Add](@ref Add) function with the template parameter
 * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_add_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return add<extension_t>(vec_a, vec_b, n, config, result);
}

/**
 * Extern version of [Sub](@ref Sub) function with the template parameter
 * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_sub_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return sub<extension_t>(vec_a, vec_b, n, config, result);
}

/**
 * Extern version of transpose_batch function with the template parameter
 * `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_transpose_matrix_cuda)(
const extension_t* input,
uint32_t row_size,
uint32_t column_size,
extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<extension_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops
@@ -1,9 +1,8 @@
#include <cuda.h>
#include <stdexcept>

#include "vec_ops/vec_ops.cuh"
#include "gpu-utils/device_context.cuh"
#include "utils/mont.cuh"
#include "../../include/vec_ops/vec_ops.cuh"
#include "../../include/gpu-utils/device_context.cuh"
// #include "utils/mont.cuh"

namespace vec_ops {

@@ -12,97 +11,47 @@ namespace vec_ops {
#define MAX_THREADS_PER_BLOCK 256

template <typename E>
__global__ void mul_kernel(const E* scalar_vec, const E* element_vec, int n, E* result)
void mul_kernel(const E* scalar_vec, const E* element_vec, int n, E* result)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
return;
}

template <typename E, typename S>
__global__ void mul_scalar_kernel(const E* element_vec, const S scalar, int n, E* result)
void mul_scalar_kernel(const E* element_vec, const S scalar, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec[tid] * (scalar); }
return;
}

template <typename E>
__global__ void div_element_wise_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void div_element_wise_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
// TODO: implement better based on https://eprint.iacr.org/2008/199
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] * E::inverse(element_vec2[tid]); }
return;
}

template <typename E>
__global__ void add_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void add_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; }
return;
}

template <typename E>
__global__ void sub_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void sub_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
return;
}

template <typename E>
__global__ void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
return;
}
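// Index check for transpose_kernel on a 2x3 row-major input (row_size = 3,
// column_size = 2): tid 0..5 reads in[] = [a b c / d e f] and writes
// out[(tid % 3) * 2 + tid / 3], giving out = [a d / b e / c f] -- a 3x2 matrix.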
} // namespace

typedef int cudaError_t;
template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
{
CHK_INIT_IF_RETURN();

// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;

E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
const E *d_vec_a, *d_vec_b;
if (!config.is_a_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_a = d_alloc_vec_a;
} else {
d_vec_a = vec_a;
}

if (!config.is_b_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_b = d_alloc_vec_b;
} else {
d_vec_b = vec_b;
}

if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
} else {
d_result = result;
}

// Call the kernel to perform element-wise operation
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);

if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }

if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
}

if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));

return CHK_LAST();
return 0;
}

template <typename E>
@@ -133,35 +82,6 @@ namespace vec_ops {
bool on_device,
bool is_async)
{
int number_of_threads = MAX_THREADS_PER_BLOCK;
int number_of_blocks = (row_size * column_size + number_of_threads - 1) / number_of_threads;
cudaStream_t stream = ctx.stream;

const E* d_mat_in;
E* d_allocated_input = nullptr;
E* d_mat_out;
if (!on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, row_size * column_size * sizeof(E), ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_allocated_input, mat_in, row_size * column_size * sizeof(E), cudaMemcpyHostToDevice, ctx.stream));

CHK_IF_RETURN(cudaMallocAsync(&d_mat_out, row_size * column_size * sizeof(E), ctx.stream));
d_mat_in = d_allocated_input;
} else {
d_mat_in = mat_in;
d_mat_out = mat_out;
}

transpose_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(d_mat_in, d_mat_out, row_size, column_size);

if (!on_device) {
CHK_IF_RETURN(
cudaMemcpyAsync(mat_out, d_mat_out, row_size * column_size * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_mat_out, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, ctx.stream));
}
if (!is_async) return CHK_STICKY(cudaStreamSynchronize(ctx.stream));

return CHK_LAST();
return 0;
}
} // namespace vec_ops
@@ -514,20 +514,20 @@ namespace msm {
class dummy_g2_t : public scalar_t
{
public:
static constexpr __host__ __device__ dummy_g2_t to_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ dummy_g2_t to_affine(const dummy_g2_t& point) { return point; }

static constexpr __host__ __device__ dummy_g2_t from_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ dummy_g2_t from_affine(const dummy_g2_t& point) { return point; }

static constexpr __host__ __device__ dummy_g2_t generator() { return dummy_g2_t{scalar_t::one()}; }
static constexpr __host__ dummy_g2_t generator() { return dummy_g2_t{scalar_t::one()}; }

static __host__ __device__ dummy_g2_t zero() { return dummy_g2_t{scalar_t::zero()}; }
static __host__ dummy_g2_t zero() { return dummy_g2_t{scalar_t::zero()}; }

friend __host__ __device__ dummy_g2_t operator*(const scalar_t& xs, const dummy_g2_t& ys)
friend __host__ dummy_g2_t operator*(const scalar_t& xs, const dummy_g2_t& ys)
{
return dummy_g2_t{scalar_t::reduce(scalar_t::mul_wide(xs, ys))};
}

friend __host__ __device__ dummy_g2_t operator+(const dummy_g2_t& xs, const dummy_g2_t& ys)
friend __host__ dummy_g2_t operator+(const dummy_g2_t& xs, const dummy_g2_t& ys)
{
scalar_t rs = {};
scalar_t::add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
Block a user