Compare commits

...

8 Commits

Author SHA1 Message Date
IdoAtlas 42cffb1c88 vec ops compiles 2024-05-12 14:01:17 +03:00
IdoAtlas d3274a9eaa poseidon compiles 2024-05-12 13:43:47 +03:00
IdoAtlas d31a7019fe polynomial compiles 2024-05-12 13:28:11 +03:00
IdoAtlas 84a0d3c348 ntt compiled 2024-05-12 12:55:07 +03:00
IdoAtlas eb87970325 msm compiled 2024-05-09 11:31:22 +03:00
IdoAtlas a9081aabbf build hash 2024-05-09 10:59:44 +03:00
IdoAtlas b564c6670d curve build 2024-05-09 10:45:01 +03:00
Otsar 1f9f3f13ea cuda cpu empty mock, field compiles 2024-05-09 10:02:59 +03:00
67 changed files with 3219 additions and 1779 deletions
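Nearly every hunk below follows the same pattern: CUDA-only qualifiers (HOST_DEVICE_INLINE, HOST_INLINE, DEVICE_INLINE, __device__, __align__, UNROLL) are stripped from declarations, include paths are rewritten as relative paths, and device-only constructs (inline PTX, extern __shared__ arrays) become empty host stubs, so that every header parses as plain C++. The macro definitions in the sketch below are an assumption about gpu-utils/modifiers.cuh (that file is not part of this diff); they only illustrate why the qualifiers have to go in a CPU-only build:

// Assumed definitions, roughly what gpu-utils/modifiers.cuh would provide under nvcc:
//   #define HOST_INLINE        __host__ __forceinline__
//   #define DEVICE_INLINE      __device__ __forceinline__
//   #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
// __host__/__device__/__forceinline__ exist only under nvcc, so a plain g++/clang++
// build must either redefine these macros to nothing or, as these commits do,
// drop the qualifiers from every declaration, e.g.:
static HOST_DEVICE_INLINE Affine neg(const Affine& point);   // before (CUDA build)
static Affine neg(const Affine& point);                      // after (host-only build)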

View File

@@ -119,6 +119,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
- [PatStiles](https://github.com/PatStiles), for adding Stark252 field
## Help & Support

View File

@@ -88,7 +88,7 @@ void point_near_x(T x, affine_t *point) {
}
static int seed = 0;
static HOST_INLINE T rand_host_seed()
static T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;

View File

@@ -1,7 +1,7 @@
#pragma once
#include "gpu-utils/sharedmem.cuh"
#include "gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include <iostream>
template <class FF>
@@ -11,26 +11,26 @@ public:
FF x;
FF y;
static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static HOST_DEVICE_INLINE Affine zero() { return {FF::zero(), FF::zero()}; }
static Affine zero() { return {FF::zero(), FF::zero()}; }
static HOST_DEVICE_INLINE Affine to_montgomery(const Affine& point)
static Affine to_montgomery(const Affine& point)
{
return {FF::to_montgomery(point.x), FF::to_montgomery(point.y)};
}
static HOST_DEVICE_INLINE Affine from_montgomery(const Affine& point)
static Affine from_montgomery(const Affine& point)
{
return {FF::from_montgomery(point.x), FF::from_montgomery(point.y)};
}
friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
friend bool operator==(const Affine& xs, const Affine& ys)
{
return (xs.x == ys.x) && (xs.y == ys.y);
}
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
friend std::ostream& operator<<(std::ostream& os, const Affine& point)
{
os << "x: " << point.x << "; y: " << point.y;
return os;
@@ -39,9 +39,9 @@ public:
template <class FF>
struct SharedMemory<Affine<FF>> {
__device__ Affine<FF>* getPointer()
Affine<FF>* getPointer()
{
extern __shared__ Affine<FF> s_affine_[];
Affine<FF> *s_affine_ = nullptr;
return s_affine_;
}
};
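extern __shared__ declares dynamic shared memory, which exists only inside CUDA kernels, so the host mock above simply hands back a null pointer to keep the header compiling. A minimal sketch (my assumption, not part of this changeset) of how both behaviors could coexist behind a compiler guard:

template <class FF>
struct SharedMemory<Affine<FF>> {
#ifdef __CUDACC__
  __device__ Affine<FF>* getPointer()
  {
    extern __shared__ Affine<FF> s_affine_[]; // dynamic shared memory, device only
    return s_affine_;
  }
#else
  Affine<FF>* getPointer() { return nullptr; } // host stub, as in this diff
#endif
};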

View File

@@ -1,9 +1,9 @@
#pragma once
#pragma once
#ifndef CURVE_CONFIG_H
#define CURVE_CONFIG_H
#include "fields/id.h"
#include "curves/projective.cuh"
#include "../fields/id.h"
#include "projective.cuh"
/**
* @namespace curve_config
@@ -12,23 +12,23 @@
* with the `-DCURVE` env variable passed during build.
*/
#if CURVE_ID == BN254
#include "curves/params/bn254.cuh"
#include "params/bn254.cuh"
namespace curve_config = bn254;
#elif CURVE_ID == BLS12_381
#include "curves/params/bls12_381.cuh"
#include "params/bls12_381.cuh"
namespace curve_config = bls12_381;
#elif CURVE_ID == BLS12_377
#include "curves/params/bls12_377.cuh"
#include "params/bls12_377.cuh"
namespace curve_config = bls12_377;
#elif CURVE_ID == BW6_761
#include "curves/params/bw6_761.cuh"
#include "params/bw6_761.cuh"
namespace curve_config = bw6_761;
#elif CURVE_ID == GRUMPKIN
#include "curves/params/grumpkin.cuh"
#include "params/grumpkin.cuh"
namespace curve_config = grumpkin;
#endif
#endif
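curve_config is resolved entirely at preprocessing time by comparing CURVE_ID against the identifiers from fields/id.h. A build sketch, assuming fields/id.h maps BN254 and the other names to distinct numeric values and assuming the include directory layout (neither appears in this diff):

  g++ -std=c++17 -DCURVE_ID=BN254 -I icicle/include -c msm.cpp

With that define, the header above pulls in params/bn254.cuh and aliases namespace curve_config = bn254; field_config.cuh later in this diff selects the scalar field the same way via -DFIELD_ID.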

View File

@@ -2,13 +2,13 @@
#ifndef BN254_PARAMS_H
#define BN254_PARAMS_H
#include "fields/storage.cuh"
#include "../../fields/storage.cuh"
#include "curves/macro.h"
#include "curves/projective.cuh"
#include "fields/snark_fields/bn254_base.cuh"
#include "fields/snark_fields/bn254_scalar.cuh"
#include "fields/quadratic_extension.cuh"
#include "../macro.h"
#include "../projective.cuh"
#include "../../fields/snark_fields/bn254_base.cuh"
#include "../../fields/snark_fields/bn254_scalar.cuh"
#include "../../fields/quadratic_extension.cuh"
namespace bn254 {
// G1 and G2 generators

View File

@@ -1,7 +1,7 @@
#pragma once
#include "affine.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/sharedmem.cuh"
template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
class Projective
@@ -19,34 +19,34 @@ public:
FF y;
FF z;
static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
static Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
static HOST_DEVICE_INLINE Affine<FF> to_affine(const Projective& point)
static Affine<FF> to_affine(const Projective& point)
{
FF denom = FF::inverse(point.z);
return {point.x * denom, point.y * denom};
}
static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point)
static Projective from_affine(const Affine<FF>& point)
{
return point == Affine<FF>::zero() ? zero() : Projective{point.x, point.y, FF::one()};
}
static HOST_DEVICE_INLINE Projective to_montgomery(const Projective& point)
static Projective to_montgomery(const Projective& point)
{
return {FF::to_montgomery(point.x), FF::to_montgomery(point.y), FF::to_montgomery(point.z)};
}
static HOST_DEVICE_INLINE Projective from_montgomery(const Projective& point)
static Projective from_montgomery(const Projective& point)
{
return {FF::from_montgomery(point.x), FF::from_montgomery(point.y), FF::from_montgomery(point.z)};
}
static HOST_DEVICE_INLINE Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
static Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
static Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
static Projective dbl(const Projective& point)
{
const FF X = point.x;
const FF Y = point.y;
@@ -74,7 +74,7 @@ public:
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
friend Projective operator+(Projective p1, const Projective& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
@@ -118,9 +118,9 @@ public:
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
friend Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2)
friend Projective operator+(Projective p1, const Affine<FF>& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
@@ -163,12 +163,12 @@ public:
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine<FF>& p2)
friend Projective operator-(Projective p1, const Affine<FF>& p2)
{
return p1 + Affine<FF>::neg(p2);
}
friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point)
friend Projective operator*(SCALAR_FF scalar, const Projective& point)
{
Projective res = zero();
#ifdef __CUDA_ARCH__
@@ -181,27 +181,27 @@ public:
return res;
}
friend HOST_DEVICE_INLINE Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
friend Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2)
friend bool operator==(const Projective& p1, const Projective& p2)
{
return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z);
}
friend HOST_DEVICE_INLINE bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
friend bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point)
friend std::ostream& operator<<(std::ostream& os, const Projective& point)
{
os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }";
return os;
}
static HOST_DEVICE_INLINE bool is_zero(const Projective& point)
static bool is_zero(const Projective& point)
{
return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero();
}
static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point)
static bool is_on_curve(const Projective& point)
{
if (is_zero(point)) return true;
bool eq_holds =
@@ -210,7 +210,7 @@ public:
return point.z != FF::zero() && eq_holds;
}
static HOST_INLINE Projective rand_host()
static Projective rand_host()
{
SCALAR_FF rand_scalar = SCALAR_FF::rand_host();
return rand_scalar * generator();
@@ -231,9 +231,9 @@ public:
template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
struct SharedMemory<Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>> {
__device__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
{
extern __shared__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> s_projective_[];
Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> *s_projective_ = nullptr;
return s_projective_;
}
};
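The body of operator*(SCALAR_FF, Projective) is mostly elided by the hunk above; after res = zero() it presumably runs the standard double-and-add loop over the scalar bits. A standalone sketch of that loop (an assumption about the elided code, using only the Projective and Field API visible in this diff):

// Left-to-right double-and-add; P is a Projective instantiation, S its scalar field.
template <typename P, typename S>
P scalar_mul_sketch(S scalar, const P& point)
{
  P res = P::zero();
  for (int i = S::NBITS - 1; i >= 0; --i) {
    res = P::dbl(res);                                      // shift the partial result up one bit
    if (scalar.get_scalar_digit(i, 1)) res = res + point;   // add the base point where the bit is set
  }
  return res;
}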

View File

@@ -18,9 +18,9 @@
#pragma once
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/error_handler.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "host_math.cuh"
#include "ptx.cuh"
#include "storage.cuh"
@@ -38,11 +38,11 @@ public:
static constexpr unsigned TLC = CONFIG::limbs_count;
static constexpr unsigned NBITS = CONFIG::modulus_bit_count;
static constexpr HOST_DEVICE_INLINE Field zero() { return Field{CONFIG::zero}; }
static constexpr Field zero() { return Field{CONFIG::zero}; }
static constexpr HOST_DEVICE_INLINE Field one() { return Field{CONFIG::one}; }
static constexpr Field one() { return Field{CONFIG::one}; }
static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
static constexpr Field from(uint32_t value)
{
storage<TLC> scalar;
scalar.limbs[0] = value;
@@ -52,7 +52,7 @@ public:
return Field{scalar};
}
static HOST_INLINE Field omega(uint32_t logn)
static Field omega(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
@@ -62,7 +62,7 @@ public:
return Field{omega.storages[logn - 1]};
}
static HOST_INLINE Field omega_inv(uint32_t logn)
static Field omega_inv(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
@@ -74,7 +74,7 @@ public:
return Field{omega_inv.storages[logn - 1]};
}
static HOST_DEVICE_INLINE Field inv_log_size(uint32_t logn)
static Field inv_log_size(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
#ifndef __CUDA_ARCH__
@@ -91,7 +91,7 @@ public:
return Field{inv.storages[logn - 1]};
}
static constexpr HOST_INLINE unsigned get_omegas_count()
static constexpr unsigned get_omegas_count()
{
if constexpr (has_member_omegas_count<CONFIG>()) {
return CONFIG::omegas_count;
@@ -113,45 +113,45 @@ public:
/**
* A new addition to the config file - \f$ 2^{32 \cdot num\_limbs} - p \f$.
*/
static constexpr HOST_DEVICE_INLINE ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
static constexpr ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
/**
* A new addition to the config file - the number of times to reduce in [reduce](@ref reduce) function.
*/
static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
static constexpr unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
static constexpr unsigned slack_bits = 32 * TLC - NBITS;
struct Wide {
ff_wide_storage limbs_storage;
static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
static constexpr Field get_lower(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}
static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
static constexpr Field get_higher(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
return out;
}
static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
static constexpr Field get_higher_with_slack(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
@@ -166,7 +166,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide& xs)
static constexpr Wide sub_modulus_squared(const Wide& xs)
{
if (REDUCTION_SIZE == 0) return xs;
const ff_wide_storage modulus = get_modulus_squared<REDUCTION_SIZE>();
@@ -175,7 +175,7 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs)
static constexpr Wide neg(const Wide& xs)
{
const ff_wide_storage modulus = get_modulus_squared<MODULUS_MULTIPLE>();
Wide rs = {};
@@ -183,14 +183,14 @@ public:
return rs;
}
friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
friend Wide operator+(Wide xs, const Wide& ys)
{
Wide rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus_squared<1>(rs);
}
friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys)
friend Wide operator-(Wide xs, const Wide& ys)
{
Wide rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -203,7 +203,7 @@ public:
// return modulus multiplied by 1, 2 or 4
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_storage get_modulus()
static constexpr ff_storage get_modulus()
{
switch (MULTIPLIER) {
case 1:
@@ -218,17 +218,17 @@ public:
}
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_wide_storage modulus_wide()
static constexpr ff_wide_storage modulus_wide()
{
return CONFIG::modulus_wide;
}
// return m
static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; }
static constexpr ff_storage get_m() { return CONFIG::m; }
// return modulus^2, helpful for ab +/- cd
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_wide_storage get_modulus_squared()
static constexpr ff_wide_storage get_modulus_squared()
{
switch (MULTIPLIER) {
case 1:
@@ -243,7 +243,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr uint32_t
add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
{
r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
@@ -258,7 +258,7 @@ public:
// add or subtract limbs
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr uint32_t
add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
{
const uint32_t* x = xs.limbs;
@@ -268,7 +268,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr uint32_t
add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
@@ -278,7 +278,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
static constexpr uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
@@ -291,7 +291,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t
static constexpr uint32_t
add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
@@ -305,7 +305,7 @@ public:
}
template <bool CARRY_OUT, typename T>
static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs)
static constexpr uint32_t add_limbs(const T& xs, const T& ys, T& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
@@ -315,7 +315,7 @@ public:
}
template <bool CARRY_OUT, typename T>
static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
static constexpr uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<true, CARRY_OUT>(xs, ys, rs);
@@ -324,18 +324,18 @@ public:
#endif
}
static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
UNROLL
for (size_t i = 0; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
}
}
static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
static void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
{
UNROLL
for (size_t i = start_i; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -343,14 +343,14 @@ public:
}
template <bool CARRY_IN = false>
static DEVICE_INLINE void
static void
cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
{
if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
UNROLL
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
@@ -358,7 +358,7 @@ public:
}
template <bool EVEN_PHASE>
static DEVICE_INLINE void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (EVEN_PHASE) {
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -367,14 +367,14 @@ public:
acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
}
UNROLL
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
}
}
static DEVICE_INLINE void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (n > 1)
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -382,7 +382,7 @@ public:
acc[0] = ptx::mad_lo(a[0], bi, acc[0]);
size_t i;
UNROLL
for (i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
if (i == n - 2)
@@ -394,7 +394,7 @@ public:
}
template <bool CARRY_OUT = false, bool CARRY_IN = false>
static DEVICE_INLINE uint32_t mad_row(
static uint32_t mad_row(
uint32_t* odd,
uint32_t* even,
const uint32_t* a,
@@ -419,7 +419,7 @@ public:
}
template <bool EVEN_PHASE>
static DEVICE_INLINE void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
@@ -428,7 +428,7 @@ public:
odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
}
static DEVICE_INLINE void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
// bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
// modulus)
@@ -439,12 +439,12 @@ public:
return;
}
static DEVICE_INLINE uint32_t
static uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);
UNROLL
for (size_t i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
@@ -467,19 +467,19 @@ public:
* \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots +
* (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
*/
static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
if constexpr (TLC > 1) {
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* even = rs.limbs;
__align__(16) uint32_t odd[2 * TLC - 2];
uint32_t odd[2 * TLC - 2];
even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
size_t i;
UNROLL
for (i = 2; i < TLC - 1; i += 2) {
mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
@@ -504,7 +504,7 @@ public:
* is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
* limb products are included.
*/
static DEVICE_INLINE void
static void
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
{
ff_storage bs = get_neg_modulus();
@@ -514,7 +514,7 @@ public:
uint32_t* even = rs.limbs;
if constexpr (TLC > 2) {
__align__(16) uint32_t odd[TLC - 1];
uint32_t odd[TLC - 1];
size_t i;
// `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
// are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
@@ -528,7 +528,6 @@ public:
mul_n(odd, a + 1, b[0], TLC - 1);
}
mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
UNROLL
for (i = 2; i < TLC - 1; i += 2) {
mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
@@ -558,15 +557,15 @@ public:
* that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the
* result inside TLC limbs and ignore the carries from the highest limb.
*/
static DEVICE_INLINE void
static void
multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
{
__align__(16) uint32_t odd[TLC - 2];
uint32_t odd[TLC - 2];
uint32_t first_row_carry = mul_n_and_add(even, a, b[0], in1);
uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);
size_t i;
UNROLL
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
carry = mad_row<true, false>(
&even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
@@ -587,15 +586,15 @@ public:
* This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
* long. The usual schoolbook algorithm is used.
*/
static DEVICE_INLINE void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
static void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
{
__align__(16) uint32_t odd[TLC - 2];
uint32_t odd[TLC - 2];
mul_n(even, a, b[0], TLC >> 1);
mul_n(odd, a + 1, b[0], TLC >> 1);
mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);
size_t i;
UNROLL
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
@@ -614,7 +613,7 @@ public:
* with so far. This method implements [subtractive
* Karatsuba](https://en.wikipedia.org/wiki/Karatsuba_algorithm#Implementation).
*/
static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
@@ -624,8 +623,8 @@ public:
// write the results into `r`.
multiply_short_raw_device(a, b, r);
multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
__align__(16) uint32_t middle_part[TLC];
__align__(16) uint32_t diffs[TLC];
uint32_t middle_part[TLC];
uint32_t diffs[TLC];
// Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to
// `carry1` and `carry2`.
uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
@@ -644,7 +643,7 @@ public:
for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
r[i] = ptx::addc_cc(r[i], 0);
} else if (TLC == 2) {
__align__(8) uint32_t odd[2];
uint32_t odd[2];
r[0] = ptx::mul_lo(a[0], b[0]);
r[1] = ptx::mul_hi(a[0], b[0]);
r[2] = ptx::mul_lo(a[1], b[1]);
@@ -662,7 +661,7 @@ public:
}
}
static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
@@ -675,7 +674,7 @@ public:
}
}
static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_raw_device(as, bs, rs);
@@ -684,7 +683,7 @@ public:
#endif
}
static HOST_DEVICE_INLINE void
static void
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
{
#ifdef __CUDA_ARCH__
@@ -697,7 +696,7 @@ public:
#endif
}
static HOST_DEVICE_INLINE void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_msb_raw_device(as, bs, rs);
@@ -709,9 +708,9 @@ public:
public:
ff_storage limbs_storage;
HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
{
const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
const uint32_t shift_bits = (digit_num * digit_width) % 32;
@@ -723,7 +722,7 @@ public:
return rv;
}
static HOST_INLINE Field rand_host()
static Field rand_host()
{
std::random_device rd;
std::mt19937_64 generator(rd());
@@ -743,7 +742,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field& xs)
static constexpr Field sub_modulus(const Field& xs)
{
if (REDUCTION_SIZE == 0) return xs;
const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
@@ -764,14 +763,14 @@ public:
return os;
}
friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
friend Field operator+(Field xs, const Field& ys)
{
Field rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus<1>(rs);
}
friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
friend Field operator-(Field xs, const Field& ys)
{
Field rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -782,7 +781,7 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide mul_wide(const Field& xs, const Field& ys)
static constexpr Wide mul_wide(const Field& xs, const Field& ys)
{
Wide rs = {};
multiply_raw(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -811,7 +810,7 @@ public:
* will cause only 1 reduction to be performed.
*/
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
static constexpr Field reduce(const Wide& xs)
{
// `xs` is left-shifted by `2 * slack_bits` and higher half is written to `xs_hi`
Field xs_hi = Wide::get_higher_with_slack(xs);
@@ -836,19 +835,19 @@ public:
return r;
}
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
friend Field operator*(const Field& xs, const Field& ys)
{
Wide xy = mul_wide(xs, ys); // full mult
return reduce(xy); // reduce mod p
}
friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
friend bool operator==(const Field& xs, const Field& ys)
{
#ifdef __CUDA_ARCH__
const uint32_t* x = xs.limbs_storage.limbs;
const uint32_t* y = ys.limbs_storage.limbs;
uint32_t limbs_or = x[0] ^ y[0];
UNROLL
for (unsigned i = 1; i < TLC; i++)
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
@@ -859,15 +858,15 @@ public:
#endif
}
friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
friend bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
template <const Field& multiplier>
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
static Field mul_const(const Field& xs)
{
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);
@@ -877,13 +876,13 @@ public:
}
template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
static constexpr T mul_unsigned(const T& xs)
{
T rs = {};
T temp = xs;
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
@@ -897,28 +896,28 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const Field& xs)
static constexpr Wide sqr_wide(const Field& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field sqr(const Field& xs)
static constexpr Field sqr(const Field& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
static constexpr Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs)
static constexpr Field from_montgomery(const Field& xs)
{
return xs * Field{CONFIG::montgomery_r_inv};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
static constexpr Field neg(const Field& xs)
{
const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
Field rs = {};
@@ -928,14 +927,14 @@ public:
// Assumes the number is even!
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
static constexpr Field div2(const Field& xs)
{
const uint32_t* x = xs.limbs_storage.limbs;
Field rs = {};
uint32_t* r = rs.limbs_storage.limbs;
if constexpr (TLC > 1) {
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC - 1; i++) {
#ifdef __CUDA_ARCH__
@@ -949,18 +948,18 @@ public:
return sub_modulus<MODULUS_MULTIPLE>(rs);
}
static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
static constexpr bool lt(const Field& xs, const Field& ys)
{
ff_storage dummy = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, dummy);
return carry;
}
static constexpr HOST_DEVICE_INLINE bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
static constexpr bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
static constexpr bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
static constexpr Field inverse(const Field& xs)
{
if (xs == zero()) return zero();
constexpr Field one = Field{CONFIG::one};
@@ -1007,9 +1006,9 @@ struct std::hash<Field<CONFIG>> {
template <class CONFIG>
struct SharedMemory<Field<CONFIG>> {
__device__ Field<CONFIG>* getPointer()
Field<CONFIG>* getPointer()
{
extern __shared__ Field<CONFIG> s_scalar_[];
Field<CONFIG>* s_scalar_ = nullptr;
return s_scalar_;
}
};
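get_scalar_digit above slices a digit_width-bit window out of the little-endian limb array, which is how MSM code typically walks a scalar. A standalone sketch of the same indexing (illustrative only; the helper name and bounds handling are mine, not from this file):

#include <cstdint>
// Extract the digit_num-th digit of digit_width bits from little-endian 32-bit limbs.
uint32_t get_digit_sketch(const uint32_t* limbs, unsigned num_limbs,
                          unsigned digit_num, unsigned digit_width)
{
  const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
  const uint32_t shift_bits = (digit_num * digit_width) % 32;
  uint64_t window = limbs[limb_lsb_idx];
  if (limb_lsb_idx + 1 < num_limbs)
    window |= (uint64_t)limbs[limb_lsb_idx + 1] << 32;      // the digit may straddle two limbs
  return (uint32_t)((window >> shift_bits) & (((uint64_t)1 << digit_width) - 1));
}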

View File

@@ -2,8 +2,8 @@
#ifndef FIELD_CONFIG_H
#define FIELD_CONFIG_H
#include "fields/id.h"
#include "fields/field.cuh"
#include "id.h"
#include "field.cuh"
/**
* @namespace field_config
@@ -11,27 +11,27 @@
* with the `-DFIELD` env variable passed during build.
*/
#if FIELD_ID == BN254
#include "fields/snark_fields/bn254_scalar.cuh"
#include "snark_fields/bn254_scalar.cuh"
namespace field_config = bn254;
#elif FIELD_ID == BLS12_381
#include "fields/snark_fields/bls12_381_scalar.cuh"
#include "snark_fields/bls12_381_scalar.cuh"
using bls12_381::fp_config;
namespace field_config = bls12_381;
#elif FIELD_ID == BLS12_377
#include "fields/snark_fields/bls12_377_scalar.cuh"
#include "snark_fields/bls12_377_scalar.cuh"
namespace field_config = bls12_377;
#elif FIELD_ID == BW6_761
#include "fields/snark_fields/bw6_761_scalar.cuh"
#include "snark_fields/bw6_761_scalar.cuh"
namespace field_config = bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "fields/snark_fields/grumpkin_scalar.cuh"
#include "snark_fields/grumpkin_scalar.cuh"
namespace field_config = grumpkin;
#elif FIELD_ID == BABY_BEAR
#include "fields/stark_fields/babybear.cuh"
#include "stark_fields/babybear.cuh"
namespace field_config = babybear;
#elif FIELD_ID == STARK_252
#include "fields/stark_fields/stark252.cuh"
#include "stark_fields/stark252.cuh"
namespace field_config = stark252;
#endif

View File

@@ -3,98 +3,97 @@
#define HOST_MATH_H
#include <cstdint>
#include <cuda_runtime.h>
#include "gpu-utils/modifiers.cuh"
#include "../gpu-utils/modifiers.cuh"
namespace host_math {
// return x + y with uint32_t operands
static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y with uint32_t operands
static uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y + carry with uint32_t operands
static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
// return x + y + carry with uint32_t operands
static uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
// return x + y and carry out with uint32_t operands
static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
// return x + y and carry out with uint32_t operands
static uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
uint32_t result;
result = x + y;
carry = x > result;
return result;
}
// return x + y + carry and carry out with uint32_t operands
static uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
const uint32_t result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
}
// return x - y with uint32_t operands
static uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y - borrow with uint32_t operands
static uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
// return x - y and borrow out with uint32_t operands
static uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
uint32_t result;
result = x - y;
borrow = x < result;
return result;
}
// return x - y - borrow and borrow out with uint32_t operands
static uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
const uint32_t result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
// return x * y + z + carry and carry out with uint32_t operands
static uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
{
uint32_t result;
uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
carry = (uint32_t)(r >> 32);
result = r & 0xffffffff;
return result;
}
template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;
constexpr carry_chain() : index(0) {}
uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
{
uint32_t result;
result = x + y;
carry = x > result;
return result;
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::add(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::add_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::addc_cc(x, y, carry);
else
return host_math::addc(x, y, carry);
}
// return x + y + carry and carry out with uint32_t operands
static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
{
const uint32_t result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::sub(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::sub_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::subc_cc(x, y, carry);
else
return host_math::subc(x, y, carry);
}
// return x - y with uint32_t operands
static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y - borrow with uint32_t operands
static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
// return x - y and borrow out with uint32_t operands
static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
uint32_t result;
result = x - y;
borrow = x < result;
return result;
}
// return x - y - borrow and borrow out with uint32_t operands
static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
const uint32_t result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
// return x * y + z + carry and carry out with uint32_t operands
static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
{
uint32_t result;
uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
carry = (uint32_t)(r >> 32);
result = r & 0xffffffff;
return result;
}
template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;
constexpr HOST_INLINE carry_chain() : index(0) {}
HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::add(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::add_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::addc_cc(x, y, carry);
else
return host_math::addc(x, y, carry);
}
HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::sub(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::sub_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::subc_cc(x, y, carry);
else
return host_math::subc(x, y, carry);
}
};
};
} // namespace host_math
#endif
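The carry_chain above stands in for the carry flag that the PTX add.cc/addc chain provides on the GPU: the first call uses add_cc to produce a carry, middle calls use addc_cc, and the last call uses addc. A minimal usage sketch (the add_2limb helper is hypothetical, not from this file):

#include <cstdint>
// Add two 2-limb (64-bit) little-endian numbers with the host carry chain.
void add_2limb(const uint32_t x[2], const uint32_t y[2], uint32_t r[2])
{
  host_math::carry_chain<2> chain;      // OPS_COUNT = 2, no carry in or out
  uint32_t carry = 0;
  r[0] = chain.add(x[0], y[0], carry);  // first op: add_cc, produces the carry
  r[1] = chain.add(x[1], y[1], carry);  // last op: addc, consumes the carry
}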

View File

@@ -1,139 +1,119 @@
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
namespace ptx {
__device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y)
uint32_t add(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y)
uint32_t add_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y)
uint32_t addc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y)
uint32_t addc_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y)
uint32_t sub(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y)
uint32_t sub_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y)
uint32_t subc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y)
uint32_t subc_cc(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y)
uint32_t mul_lo(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y)
uint32_t mul_hi(const uint32_t x, const uint32_t y)
{
uint32_t result;
asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result;
asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
uint32_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi)
uint64_t mov_b64(uint32_t lo, uint32_t hi)
{
uint64_t result;
asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi));
uint64_t result = 0;
return result;
}
@@ -141,142 +121,124 @@ namespace ptx {
// Callers should know exactly what they're calling (no implicit conversions).
namespace u64 {
__device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y)
uint64_t add(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y)
uint64_t add_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y)
uint64_t addc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y)
uint64_t addc_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y)
uint64_t sub(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y)
uint64_t sub_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y)
uint64_t subc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y)
uint64_t subc_cc(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y)
uint64_t mul_lo(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y)
uint64_t mul_hi(const uint64_t x, const uint64_t y)
{
uint64_t result;
asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
__device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result;
asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
uint64_t result = 0;
return result;
}
} // namespace u64
__device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count)
void bar_arrive(const unsigned name, const unsigned count)
{
asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory");
return;
}
__device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count)
void bar_sync(const unsigned name, const unsigned count)
{
asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory");
return;
}
} // namespace ptx
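With the CPU mock, every ptx:: helper above now returns 0: enough to make dependent code compile, but not to compute anything. For comparison, a functional host equivalent of two of the simpler helpers would look like the sketch below (my assumption, not part of these commits):

#include <cstdint>
namespace ptx_host_sketch {
  // Low and high 32 bits of the full 64-bit product, matching mul.lo.u32 / mul.hi.u32.
  inline uint32_t mul_lo(uint32_t x, uint32_t y) { return (uint32_t)((uint64_t)x * y); }
  inline uint32_t mul_hi(uint32_t x, uint32_t y) { return (uint32_t)(((uint64_t)x * y) >> 32); }
}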

View File

@@ -1,8 +1,8 @@
#pragma once
#include "field.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
template <typename CONFIG>
class ExtensionField
@@ -16,12 +16,12 @@ private:
FWide real;
FWide imaginary;
friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
@@ -34,21 +34,21 @@ public:
FF real;
FF imaginary;
static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r}, xs.imaginary * FF{CONFIG::montgomery_r}};
}
static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}};
}
static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static void rand_host_many(ExtensionField* out, int size)
{
@@ -57,7 +57,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.imaginary)};
}
@@ -68,38 +68,38 @@ public:
return os;
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.imaginary)};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.imaginary};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.imaginary};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
FWide real_prod = FF::mul_wide(xs.real, ys.real);
FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
@@ -110,40 +110,40 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{FF::mul_wide(xs.real, ys), FF::mul_wide(xs.imaginary, ys)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return mul_wide(ys, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
static constexpr ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
}
template <class T1, class T2>
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
friend ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}
friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
}
friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
template <const ExtensionField& multiplier>
static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
static ExtensionField mul_const(const ExtensionField& xs)
{
static constexpr FF mul_real = multiplier.real;
static constexpr FF mul_imaginary = multiplier.imaginary;
@@ -159,33 +159,33 @@ public:
}
template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
static constexpr ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
static constexpr ExtensionField neg(const ExtensionField& xs)
{
return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
}
// inverse of zero is set to be zero which is what we want most of the time
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
static constexpr ExtensionField inverse(const ExtensionField& xs)
{
ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr(xs.imaginary));
@@ -198,9 +198,9 @@ public:
template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
__device__ ExtensionField<CONFIG>* getPointer()
ExtensionField<CONFIG>* getPointer()
{
extern __shared__ ExtensionField<CONFIG> s_ext2_scalar_[];
ExtensionField<CONFIG> *s_ext2_scalar_;
return s_ext2_scalar_;
}
};
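For reference on the conjugate-based inversion shown above: in a quadratic extension x = a + b·u with u² = nonresidue, the inverse is x⁻¹ = (a − b·u) · (a² − nonresidue·b²)⁻¹, which is why the code forms the conjugate {real, −imaginary} and nonresidue·imaginary², reducing the problem to a single base-field inversion of the norm.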

View File

@@ -1,8 +1,8 @@
#pragma once
#include "field.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
template <typename CONFIG>
class ExtensionField
@@ -16,12 +16,12 @@ private:
FWide im2;
FWide im3;
friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
@@ -36,31 +36,31 @@ public:
FF im2;
FF im3;
static constexpr HOST_DEVICE_INLINE ExtensionField zero()
static constexpr ExtensionField zero()
{
return ExtensionField{FF::zero(), FF::zero(), FF::zero(), FF::zero()};
}
static constexpr HOST_DEVICE_INLINE ExtensionField one()
static constexpr ExtensionField one()
{
return ExtensionField{FF::one(), FF::zero(), FF::zero(), FF::zero()};
}
static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r},
xs.im3 * FF{CONFIG::montgomery_r}};
}
static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv},
xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}};
}
static HOST_INLINE ExtensionField rand_host()
static ExtensionField rand_host()
{
return ExtensionField{FF::rand_host(), FF::rand_host(), FF::rand_host(), FF::rand_host()};
}
@@ -72,7 +72,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{
FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.im1),
@@ -86,38 +86,38 @@ public:
return os;
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.im1, ys.im2, ys.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.im1), FF::neg(ys.im2), FF::neg(ys.im3)};
}
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.im1, xs.im2, xs.im3};
}
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.im1, xs.im2, xs.im3};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
if (CONFIG::nonresidue_is_negative)
return ExtensionWide{
@@ -144,21 +144,21 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{
FF::mul_wide(xs.real, ys), FF::mul_wide(xs.im1, ys), FF::mul_wide(xs.im2, ys), FF::mul_wide(xs.im3, ys)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return ExtensionWide{
FF::mul_wide(xs, ys.real), FF::mul_wide(xs, ys.im1), FF::mul_wide(xs, ys.im2), FF::mul_wide(xs, ys.im3)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
static constexpr ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.im1),
@@ -166,21 +166,21 @@ public:
}
template <class T1, class T2>
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
friend ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}
friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3);
}
friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {
FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.im1),
@@ -188,27 +188,27 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
static constexpr ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
static constexpr ExtensionField neg(const ExtensionField& xs)
{
return {FF::neg(xs.real), FF::neg(xs.im1), FF::neg(xs.im2), FF::neg(xs.im3)};
}
// inverse of zero is set to be zero which is what we want most of the time
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
static constexpr ExtensionField inverse(const ExtensionField& xs)
{
FF x, x0, x2;
if (CONFIG::nonresidue_is_negative) {
@@ -249,9 +249,9 @@ public:
template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
__device__ ExtensionField<CONFIG>* getPointer()
ExtensionField<CONFIG>* getPointer()
{
extern __shared__ ExtensionField<CONFIG> s_ext4_scalar_[];
ExtensionField<CONFIG> *s_ext4_scalar_=nullptr;
return s_ext4_scalar_;
}
};

View File

@@ -2,7 +2,7 @@
#ifndef BN254_BASE_PARAMS_H
#define BN254_BASE_PARAMS_H
#include "fields/storage.cuh"
#include "../storage.cuh"
namespace bn254 {
struct fq_config {

View File

@@ -2,9 +2,9 @@
#ifndef BN254_SCALAR_PARAMS_H
#define BN254_SCALAR_PARAMS_H
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quadratic_extension.cuh"
#include "../storage.cuh"
#include "../field.cuh"
#include "../quadratic_extension.cuh"
namespace bn254 {
struct fp_config {

View File

@@ -1,8 +1,8 @@
#pragma once
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quartic_extension.cuh"
#include "../storage.cuh"
#include "../field.cuh"
#include "../quartic_extension.cuh"
namespace babybear {
struct fp_config {

View File

@@ -1,20 +1,20 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H
#include <cuda_runtime.h>
#include <cstddef>
namespace device_context {
constexpr std::size_t MAX_DEVICES = 32;
size_t MAX_DEVICES = 32;
/**
* Properties of the device used in icicle functions.
*/
struct DeviceContext {
cudaStream_t& stream; /**< Stream to use. Default value: 0. */
int stream; /**< Stream to use. Default value: 0. */
std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
int mempool; /**< Mempool to use. Default value: 0. */
};
/**
@@ -22,9 +22,9 @@ namespace device_context {
*/
inline DeviceContext get_default_device_context() // TODO: naming convention ?
{
static cudaStream_t default_stream = (cudaStream_t)0;
static int default_stream = 0;
return DeviceContext{
(cudaStream_t&)default_stream, // stream
default_stream, // stream
0, // device_id
0, // mempool
};
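A minimal host-side sketch of using the default context above (hypothetical caller; assumes only the CPU-mock DeviceContext shown here, where stream and mempool are plain ints, and that the header is on the include path):

// Illustrative only - not part of the diff.
int use_default_context()
{
  // stream = 0, device_id = 0, mempool = 0 in the mock.
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  // Downstream APIs (ntt, msm, poseidon, ...) take this context by reference.
  return static_cast<int>(ctx.device_id);
}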

View File

@@ -3,12 +3,10 @@
#define ERR_H
#include <iostream>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
enum class IcicleError_t {
enum IcicleError_t {
IcicleSuccess = 0,
InvalidArgument = 1,
MemoryAllocationError = 2,
@@ -38,14 +36,14 @@ private:
public:
// Constructor for cudaError_t with optional message
IcicleError(cudaError_t cudaError, const std::string& msg = "")
: std::runtime_error("CUDA Error: " + std::string(cudaGetErrorString(cudaError)) + " " + msg),
IcicleError(int cudaError, const std::string& msg = "")
: std::runtime_error("Error: " + msg),
errCode(static_cast<int>(cudaError))
{
}
// Constructor for cudaError_t with const char* message
IcicleError(cudaError_t cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
IcicleError(int cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
// Constructor for IcicleError_t with optional message
IcicleError(IcicleError_t icicleError, const std::string& msg = "")
@@ -67,11 +65,10 @@ public:
#define CHK_LOG(val) check((val), #val, __FILE__, __LINE__)
#define CHK_VAL(val, file, line) check((val), #val, file, line)
cudaError_t inline check(cudaError_t err, const char* const func, const char* const file, const int line)
int inline check(int err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
if (err != 0) {
std::cerr << "CUDA Runtime Error by: " << func << " at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << std::endl << std::endl;
}
return err;
@@ -90,12 +87,12 @@ cudaError_t inline check(cudaError_t err, const char* const func, const char* co
#define THROW_ICICLE_CUDA(val) throwIcicleCudaErr(val, __FUNCTION__, __FILE__, __LINE__)
#define THROW_ICICLE_CUDA_ERR(val, func, file, line) throwIcicleCudaErr(val, func, file, line)
void inline throwIcicleCudaErr(
cudaError_t err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
int err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
{
// TODO: fmt::format introduced only in C++20
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "") + std::string{cudaGetErrorString(err)} +
" : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
"\nThe error is reported there and may be caused by prior calls.\n";
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "");
// + " : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
// "\nThe error is reported there and may be caused by prior calls.\n";
std::cerr << err_msg << std::endl; // TODO: Logging
throw IcicleError{err, err_msg};
}
@@ -111,14 +108,14 @@ void inline throwIcicleErr(
throw IcicleError{err, err_msg};
}
cudaError_t inline checkCudaErrorIsSticky(
cudaError_t err, const char* const func, const char* const file, const int line, bool isThrowing = true)
int inline checkCudaErrorIsSticky(
int err, const char* const func, const char* const file, const int line, bool isThrowing = true)
{
if (err != cudaSuccess) {
if (err != 0) {
// check for sticky (unrecoverable) error when the only option is to restart process
cudaError_t err2 = cudaDeviceSynchronize();
int err2 = 0;
bool is_logged;
if (err2 != cudaSuccess) { // we suspect sticky error
if (err2 != 0) { // we suspect sticky error
if (err != err2) {
is_logged = true;
CHK_ERR(err, func, file, line);
@@ -139,13 +136,13 @@ cudaError_t inline checkCudaErrorIsSticky(
// most common macros to use
#define CHK_INIT_IF_RETURN() \
{ \
cudaError_t err_result = CHK_LAST(); \
int err_result = CHK_LAST(); \
if (err_result != cudaSuccess) return err_result; \
}
#define CHK_IF_RETURN(val) \
{ \
cudaError_t err_result = CHK_STICKY(val); \
int err_result = CHK_STICKY(val); \
if (err_result != cudaSuccess) return err_result; \
}
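A sketch of how these early-return macros are typically used in an API entry point (do_work and allocate_buffers are hypothetical; assumes cudaSuccess still evaluates to 0 in this CPU mock):

// Illustrative only - not part of the diff.
cudaError_t do_work(int n)
{
  CHK_INIT_IF_RETURN();               // return immediately if a prior error is pending
  CHK_IF_RETURN(allocate_buffers(n)); // propagate the first failing call's error code
  return CHK_LAST();                  // report any error raised since the last check
}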

View File

@@ -6,6 +6,6 @@
#define UNROLL #pragma unroll
#endif
#define HOST_INLINE __host__ INLINE_MACRO
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO
// #define __host__ INLINE_MACRO
// #define INLINE_MACRO
// #define __host__ INLINE_MACRO

View File

@@ -24,7 +24,7 @@
* definitions.
*
* To use dynamically allocated shared memory in a templatized __global__ or
* __device__ function, just replace code like this:
* function, just replace code like this:
*
* <pre>
* template<class T>
@@ -32,7 +32,7 @@
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* extern __shared__ T sdata[];
* T sdata[];
* ...
* doStuff(sdata);
* ...
@@ -62,7 +62,7 @@
*
* This struct uses template specialization on the type \a T to declare
* a differently named dynamic shared memory array for each type
* (\code extern __shared__ T s_type[] \endcode).
* (\code T s_type[] \endcode).
*
* Currently there are specializations for the following types:
* \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
@@ -73,11 +73,10 @@ template <typename T>
struct SharedMemory {
//! @brief Return a pointer to the runtime-sized shared memory array.
//! @returns Pointer to runtime-sized shared memory array
__device__ T* getPointer()
T* getPointer()
{
// extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
// Error_UnsupportedType();
return (T*)0;
T* a = nullptr; // Initialize pointer to nullptr or allocate memory as needed
return a;
}
// TODO: Use operator overloading to make this class look like a regular array
};
@@ -88,129 +87,128 @@ struct SharedMemory {
template <>
struct SharedMemory<int> {
__device__ int* getPointer()
int* getPointer()
{
extern __shared__ int s_int[];
return s_int;
return 0;
}
};
template <>
struct SharedMemory<unsigned int> {
__device__ unsigned int* getPointer()
unsigned int* getPointer()
{
extern __shared__ unsigned int s_uint[];
return s_uint;
return 0;
}
};
template <>
struct SharedMemory<char> {
__device__ char* getPointer()
char* getPointer()
{
extern __shared__ char s_char[];
return s_char;
char *a = nullptr;
return a;
}
};
template <>
struct SharedMemory<unsigned char> {
__device__ unsigned char* getPointer()
unsigned char* getPointer()
{
extern __shared__ unsigned char s_uchar[];
return s_uchar;
unsigned char* a = nullptr;
return a;
}
};
template <>
struct SharedMemory<short> {
__device__ short* getPointer()
short* getPointer()
{
extern __shared__ short s_short[];
return s_short;
short* a = nullptr;
return a;
}
};
template <>
struct SharedMemory<unsigned short> {
__device__ unsigned short* getPointer()
unsigned short* getPointer()
{
extern __shared__ unsigned short s_ushort[];
return s_ushort;
unsigned short* a = nullptr;
return a;
}
};
template <>
struct SharedMemory<long> {
__device__ long* getPointer()
long* getPointer()
{
extern __shared__ long s_long[];
long *s_long = nullptr;
return s_long;
}
};
template <>
struct SharedMemory<unsigned long> {
__device__ unsigned long* getPointer()
unsigned long* getPointer()
{
extern __shared__ unsigned long s_ulong[];
unsigned long *s_ulong = nullptr;
return s_ulong;
}
};
template <>
struct SharedMemory<long long> {
__device__ long long* getPointer()
long long* getPointer()
{
extern __shared__ long long s_longlong[];
long long *s_longlong;
return s_longlong;
}
};
template <>
struct SharedMemory<unsigned long long> {
__device__ unsigned long long* getPointer()
unsigned long long* getPointer()
{
extern __shared__ unsigned long long s_ulonglong[];
unsigned long long *s_ulonglong;
return s_ulonglong;
}
};
template <>
struct SharedMemory<bool> {
__device__ bool* getPointer()
bool* getPointer()
{
extern __shared__ bool s_bool[];
bool *s_bool;
return s_bool;
}
};
template <>
struct SharedMemory<float> {
__device__ float* getPointer()
float* getPointer()
{
extern __shared__ float s_float[];
float *s_float;
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ double* getPointer()
double* getPointer()
{
extern __shared__ double s_double[];
double *s_double;
return s_double;
}
};
template <>
struct SharedMemory<uchar4> {
__device__ uchar4* getPointer()
{
extern __shared__ uchar4 s_uchar4[];
return s_uchar4;
}
};
// template <>
// struct SharedMemory<uchar4> {
// uchar4* getPointer()
// {
// uchar4 *s_uchar4;
// return s_uchar4;
// }
// };
#endif //_SHAREDMEM_H_
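A minimal sketch of the pattern the comment block above describes (foo is hypothetical; note that in this CPU mock getPointer() returns a null/zero pointer rather than a real shared-memory array, so the body must not dereference it):

// Illustrative only - not part of the diff.
template <class T>
void foo(T* d_out, const T* d_in)
{
  SharedMemory<T> smem;
  T* sdata = smem.getPointer(); // typed alias for the (mocked) dynamic shared memory
  // ... stage d_in into sdata, compute, write results to d_out ...
  (void)sdata;
}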

View File

@@ -3,9 +3,9 @@
#define KECCAK_H
#include <cstdint>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "../../gpu-utils/device_context.cuh"
#include "../../gpu-utils/error_handler.cuh"
typedef int cudaError_t;
namespace keccak {
/**
* @struct KeccakConfig

View File

@@ -2,13 +2,12 @@
#ifndef MSM_H
#define MSM_H
#include <cuda_runtime.h>
#include "curves/affine.cuh"
#include "curves/projective.cuh"
#include "fields/field.cuh"
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "../curves/affine.cuh"
#include "../curves/projective.cuh"
#include "../fields/field.cuh"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"
/**
* @namespace msm
@@ -124,6 +123,8 @@ namespace msm {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
typedef int cudaError_t;
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);

View File

@@ -2,13 +2,11 @@
#ifndef NTT_H
#define NTT_H
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "../utils/utils_kernels.cuh"
#include "../utils/utils.h"
/**
* @namespace ntt
@@ -36,6 +34,8 @@ namespace ntt {
* primitive_root).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
typedef int cudaError_t;
template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);

View File

@@ -3,8 +3,9 @@
#define _NTT_IMPL_H
#include <stdint.h>
#include "ntt/ntt.cuh" // for enum Ordering
#include "ntt.cuh" // for enum Ordering
typedef int cudaError_t;
typedef int cudaStream_t;
namespace mxntt {
template <typename S>

View File

@@ -1,8 +1,8 @@
#pragma once
#include "gpu-utils/device_context.cuh"
#include "fields/field_config.cuh"
#include "polynomials/polynomials.h"
#include "../../gpu-utils/device_context.cuh"
#include "../../fields/field_config.cuh"
#include "../polynomials.h"
using device_context::DeviceContext;
@@ -11,7 +11,7 @@ namespace polynomials {
class CUDAPolynomialFactory : public AbstractPolynomialFactory<C, D, I>
{
std::vector<DeviceContext> m_device_contexts; // device-id --> device context
std::vector<cudaStream_t> m_device_streams; // device-id --> device stream. Storing the streams here as workaround
std::vector<int> m_device_streams; // device-id --> device stream. Storing the streams here as workaround
// since DeviceContext has a reference to a stream.
public:

View File

@@ -6,7 +6,7 @@
#include <algorithm> // for std::max
#include <cstdint> // for uint64_t, etc.
#include <memory>
#include "utils/integrity_pointer.h"
#include "../utils/integrity_pointer.h"
namespace polynomials {

View File

@@ -2,8 +2,8 @@
#include <iostream>
#include <memory>
#include "utils/integrity_pointer.h"
#include "fields/field_config.cuh"
#include "../utils/integrity_pointer.h"
#include "../fields/field_config.cuh"
#include "polynomial_context.h"
#include "polynomial_backend.h"

View File

@@ -4,9 +4,9 @@
#include <cstdint>
#include <stdexcept>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
#include "../gpu-utils/device_context.cuh"
#include "../gpu-utils/error_handler.cuh"
#include "../utils/utils.h"
/**
* @namespace poseidon
@@ -117,6 +117,7 @@ namespace poseidon {
/**
* Loads pre-calculated optimized constants, moves them to the device
*/
typedef int cudaError_t;
template <typename S>
cudaError_t
init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);

View File

@@ -9,7 +9,7 @@ namespace mont {
#define MAX_THREADS_PER_BLOCK 256
template <typename E, bool is_into>
__global__ void MontgomeryKernel(const E* input, int n, E* output)
void MontgomeryKernel(const E* input, int n, E* output)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { output[tid] = is_into ? E::to_montgomery(input[tid]) : E::from_montgomery(input[tid]); }

View File

@@ -4,14 +4,13 @@
namespace utils_internal {
template <typename E, typename S>
__global__ void NormalizeKernel(E* arr, S scalar, int n)
void NormalizeKernel(E* arr, S scalar, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { arr[tid] = scalar * arr[tid]; }
return;
}
template <typename E, typename S>
__global__ void BatchMulKernel(
void BatchMulKernel(
const E* in_vec,
int n_elements,
int batch_size,
@@ -22,12 +21,7 @@ namespace utils_internal {
bool bitrev,
E* out_vec)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n_elements * batch_size) {
int64_t scalar_id = tid % n_elements;
if (bitrev) scalar_id = __brev(scalar_id) >> (32 - logn);
out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
}
return;
}
} // namespace utils_internal

View File

@@ -2,7 +2,7 @@
#ifndef LDE_H
#define LDE_H
#include "gpu-utils/device_context.cuh"
#include "../gpu-utils/device_context.cuh"
/**
* @namespace vec_ops
@@ -57,6 +57,7 @@ namespace vec_ops {
* @tparam E The type of elements `vec_b` and `result`. Often (but not always) `E=S`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
typedef int cudaError_t;
template <typename E, typename S>
cudaError_t Mul(const S* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result);

View File

@@ -1,29 +1,26 @@
if (G2)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
endif ()
set(TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ${CMAKE_SOURCE_DIR}/src)
set(SRC ../../)
set(CURVE_SOURCE ${SRC}/curves/extern.cu)
set(CURVE_SOURCE ${SRC}/curves/extern.cpp)
if(G2)
list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cu)
endif()
if(MSM)
list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cu)
if(G2)
list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cu)
endif()
endif()
if(ECNTT)
list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cu)
list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cu)
list(APPEND CURVE_SOURCE ${SRC}/curves/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${CURVE_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,40 @@
#define CURVE_ID BN254
#include "../../include/curves/curve_config.cuh"
using namespace curve_config;
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
// #include "../utils/mont.cuh"
extern "C" bool CONCAT_EXPAND(CURVE, eq)(projective_t* point1, projective_t* point2)
{
return true;
}
extern "C" void CONCAT_EXPAND(CURVE, to_affine)(projective_t* point, affine_t* point_out)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, generate_projective_points)(projective_t* points, int size)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, generate_affine_points)(affine_t* points, int size)
{
return;
}
extern "C" int CONCAT_EXPAND(CURVE, affine_convert_montgomery)(
affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}
extern "C" int CONCAT_EXPAND(CURVE, projective_convert_montgomery)(
projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -0,0 +1,39 @@
#include "curves/curve_config.cuh"
using namespace curve_config;
#include "gpu-utils/device_context.cuh"
#include "utils/utils.h"
#include "utils/mont.cuh"
extern "C" bool CONCAT_EXPAND(CURVE, g2_eq)(g2_projective_t* point1, g2_projective_t* point2)
{
return true;
}
extern "C" void CONCAT_EXPAND(CURVE, g2_to_affine)(g2_projective_t* point, g2_affine_t* point_out)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_projective_points)(g2_projective_t* points, int size)
{
return;
}
extern "C" void CONCAT_EXPAND(CURVE, g2_generate_affine_points)(g2_affine_t* points, int size)
{
return;
}
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_affine_convert_montgomery)(
g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_projective_convert_montgomery)(
g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -1,40 +1,37 @@
if (EXT_FIELD)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
endif ()
SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin)
set(TARGET icicle_field)
set(SRC ${CMAKE_SOURCE_DIR}/src)
set(SRC ../../)
set(FIELD_SOURCE ${SRC}/fields/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
set(FIELD_SOURCE ${SRC}/fields/extern.cpp)
# list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
if(EXT_FIELD)
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu)
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cpp)
# list(APPEND FIELD_SOURCE ${SRC}/ntt/extern_extension.cu)
# list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern_extension.cu)
endif()
set(POLYNOMIAL_SOURCE_FILES
${SRC}/polynomials/polynomials.cu
${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
${SRC}/polynomials/polynomials_c_api.cu)
# set(POLYNOMIAL_SOURCE_FILES
# ${SRC}/polynomials/polynomials.cu
# ${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
# ${SRC}/polynomials/polynomials_c_api.cu)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
# list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
# TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
if (DEFINED CURVE)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
endif()
# if (DEFINED CURVE)
# list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
# list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
# endif()
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
endif()
# if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
# list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
# list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
# endif()
add_library(${TARGET} STATIC ${FIELD_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
target_compile_definitions(${TARGET} PUBLIC FIELD=${FIELD})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,19 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
//#include "../../include/utils/mont.cuh"
#include "../../include/utils/utils.h"
#include "../../include/gpu-utils/device_context.cuh"
extern "C" void CONCAT_EXPAND(FIELD, generate_scalars)(scalar_t* scalars, int size)
{
return;
}
extern "C" int CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(
scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -0,0 +1,18 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "utils/mont.cuh"
#include "utils/utils.h"
#include "gpu-utils/device_context.cuh"
extern "C" void CONCAT_EXPAND(FIELD, extension_generate_scalars)(extension_t* scalars, int size)
{
return;
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(
extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
{
return 0;
}

View File

@@ -1,5 +1,6 @@
set(TARGET icicle_hash)
add_library(${TARGET} STATIC keccak/keccak.cu)
add_library(${TARGET} STATIC keccak/keccak.cpp)
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,24 @@
#include "../../../include/hash/keccak/keccak.cuh"
typedef int cudaError_t;
namespace keccak {
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return 0;
}
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return 0;
}
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return 0;
}
} // namespace keccak

View File

@@ -144,14 +144,14 @@ namespace keccak {
element ^= rc; \
}
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
__device__ void keccakf(uint64_t s[25])
void keccakf(uint64_t s[25])
{
uint64_t t0, t1, t2, t3, t4;

View File

@@ -0,0 +1,28 @@
set(TARGET icicle_msm)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(MSM_SOURCE ${SRC}/msm/extern.cpp)
if(G2)
list(APPEND MSM_SOURCE ${SRC}/msm/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${MSM_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

46
icicle/src/msm/extern.cpp Normal file
View File

@@ -0,0 +1,46 @@
#define CURVE_ID BN254
#define FIELD_ID BN254
#include "../../include/curves/curve_config.cuh"
#include "../../include/fields/field_config.cuh"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/msm/msm.cuh"
typedef int cudaError_t;
using namespace curve_config;
using namespace field_config;
#include "../../include/utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref affine_t) of curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_bases_cuda)(
affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
affine_t* output_bases)
{
return 0;
}
/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* - `A` is the [affine representation](@ref affine_t) of curve points;
* - `P` is the [projective representation](@ref projective_t) of curve points.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, msm_cuda)(
const scalar_t* scalars, const affine_t* points, int msm_size, MSMConfig& config, projective_t* out)
{
return 0;
}
} // namespace msm
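A hypothetical caller of the mock entry point above. The exported symbol name is produced by CONCAT_EXPAND from the CURVE build definition; bn254_msm_cuda is assumed here purely for illustration, and the curve types are taken from curve_config as in the file above:

// Illustrative only - not part of the diff.
extern "C" int bn254_msm_cuda(
  const scalar_t* scalars, const affine_t* points, int msm_size, msm::MSMConfig& config, projective_t* out);

int run_mock_msm(const scalar_t* scalars, const affine_t* points, int n, msm::MSMConfig& cfg)
{
  projective_t result;
  // The CPU mock always returns 0 (success) and leaves `result` untouched.
  return bn254_msm_cuda(scalars, points, n, cfg, &result);
}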

View File

@@ -0,0 +1,43 @@
#include "curves/curve_config.cuh"
#include "fields/field_config.cuh"
using namespace curve_config;
using namespace field_config;
#include "msm.cu"
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_bases_cuda)(
g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
g2_affine_t* output_bases)
{
return precompute_msm_bases<g2_affine_t, g2_projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}
/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_msm_cuda)(
const scalar_t* scalars, const g2_affine_t* points, int msm_size, MSMConfig& config, g2_projective_t* out)
{
return msm<scalar_t, g2_affine_t, g2_projective_t>(scalars, points, msm_size, config, out);
}
} // namespace msm

View File

@@ -20,32 +20,32 @@ public:
unsigned p = 10;
// unsigned p = 1<<30;
static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
static Dummy_Scalar zero() { return {0}; }
static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
static Dummy_Scalar one() { return {1}; }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
friend std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
{
os << scalar.x;
return os;
}
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
{
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
}
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
friend Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
{
return {(p1.x + p2.x) % p1.p};
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
friend bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
friend bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static HOST_INLINE Dummy_Scalar rand_host()
static Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static Dummy_Scalar rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};
@@ -57,32 +57,32 @@ class Dummy_Projective
public:
Dummy_Scalar x;
static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; }
static Dummy_Projective zero() { return {0}; }
static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; }
static Dummy_Projective one() { return {1}; }
static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
static Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
static Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
static Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
friend Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
{
return {p1.x + p2.x};
}
// friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// friend Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// return p1 + neg(p2);
// }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
friend std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
{
os << point.x;
return os;
}
friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
friend Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
{
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
@@ -95,14 +95,14 @@ public:
return res;
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
friend bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
{
return (p1.x == p2.x);
}
static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
static bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
static HOST_INLINE Dummy_Projective rand_host()
static Dummy_Projective rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};

View File

@@ -0,0 +1,34 @@
set(TARGET icicle_ntt)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(NTT_SOURCE ${SRC}/ntt/extern.cpp)
set(NTT_SOURCE_EXTENSION ${SRC}/ntt/extern_extension.cpp)
set(NTT_SOURCE_EC ${SRC}/ntt/extern_ecntt.cpp)
set(NTT_SOURCE ${SRC}/ntt/extern.cpp)
if(G2)
list(APPEND NTT_SOURCE ${SRC}/ntt/extern_g2.cpp)
endif()
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${NTT_SOURCE})
add_library(${TARGET} STATIC ${NTT_SOURCE_EXTENSION})
add_library(${TARGET} STATIC ${NTT_SOURCE_EC})
add_library(${TARGET} STATIC )
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -1,13 +1,21 @@
build_verification:
mkdir -p work
nvcc -o work/test_verification -I. -I../../include tests/verification.cu -std=c++17
g++ -o work/test_verification -Intt.cpp -Ikernel_ntt.cpp -Iextern.cpp -Iextern_ecntt.cpp -Iextern_extension.cpp -I../../include tests/verification.cpp -std=c++17
build_extern:
g++ -o work/test_verification -I../../include extern.cpp -std=c++17
extern.o: extern.cpp
g++ -std=c++17 -std=gnu++11 -c -o $@ $< -I../../include
test_verification: build_verification
work/test_verification
build_verification_ecntt:
mkdir -p work
nvcc -o work/test_verification_ecntt -I. -I../../include tests/verification.cu -std=c++17 -DECNTT
g++ -o work/test_verification_ecntt -I. -I../../include tests/verification.cpp -std=c++17 -DECNTT
test_verification_ecntt: build_verification_ecntt
work/test_verification_ecntt

60
icicle/src/ntt/extern.cpp Normal file
View File

@@ -0,0 +1,60 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "ntt.cpp"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
typedef int cudaError_t;
namespace ntt {
/**
* Extern "C" version of [init_domain](@ref init_domain) function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, initialize_domain)(
scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return 0;
}
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, ntt_cuda)(
const scalar_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, scalar_t* output)
{
return ntt<scalar_t, scalar_t>(input, size, dir, config, output);
}
/**
* Extern "C" version of [release_domain](@ref release_domain) function with the following values of template
* parameters (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, release_domain)(device_context::DeviceContext& ctx)
{
return release_domain<scalar_t>(ctx);
}
/**
* Extern "C" version of [get_root_of_unity](@ref get_root_of_unity) function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
*/
extern "C" scalar_t CONCAT_EXPAND(FIELD, get_root_of_unity)(uint32_t logn)
{
return get_root_of_unity<scalar_t>(logn);
}
} // namespace ntt

View File

@@ -0,0 +1,28 @@
#define FIELD_ID BN254
#define CURVE_ID BN254
#include "../../include/curves/curve_config.cuh"
#include "../../include/fields/field_config.cuh"
using namespace curve_config;
using namespace field_config;
#include "ntt.cpp"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
namespace ntt {
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
* - `E` is the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ecntt_cuda)(
const projective_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, projective_t* output)
{
return ntt<scalar_t, projective_t>(input, size, dir, config, output);
}
} // namespace ntt

View File

@@ -0,0 +1,24 @@
#define FIELD_ID BABY_BEAR
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "ntt.cpp"
#include "../../include/gpu-utils/device_context.cuh"
#include "../../include/utils/utils.h"
namespace ntt {
/**
* Extern "C" version of [ntt](@ref ntt) function with the following values of template parameters
* (where the field is given by `-DFIELD` env variable during build):
* - `E` is the [field](@ref scalar_t);
* - `S` is the [extension](@ref extension_t) of `E` of appropriate degree;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_ntt_cuda)(
const extension_t* input, int size, NTTDir dir, NTTConfig<scalar_t>& config, extension_t* output)
{
return ntt<scalar_t, extension_t>(input, size, dir, config, output);
}
} // namespace ntt

File diff suppressed because it is too large

View File

@@ -8,7 +8,7 @@ using namespace field_config;
namespace mxntt {
static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
static inline uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
{
uint32_t rev_num = 0, temp, dig_len;
if (dit) {
@@ -31,11 +31,11 @@ namespace mxntt {
return rev_num;
}
static inline __device__ uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }
static inline uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }
enum eRevType { None, RevToMixedRev, MixedRevToRev, NaturalToMixedRev, NaturalToRev, MixedRevToNatural };
static __device__ uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
static uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
{
switch (rev_type) {
case eRevType::RevToMixedRev:

338
icicle/src/ntt/ntt.cpp Normal file
View File

@@ -0,0 +1,338 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/ntt/ntt.cuh"
#include <unordered_map>
#include <vector>
#include <type_traits>
#include "../../include/gpu-utils/sharedmem.cuh"
#include "../../include/utils/utils_kernels.cuh"
#include "../../include/utils/utils.h"
#include "../../include/ntt/ntt_impl.cuh"
#include "../../include/gpu-utils/device_context.cuh"
#include <mutex>
#ifdef CURVE_ID
#include "../../include/curves/curve_config.cuh"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif
namespace ntt {
namespace {
// TODO: Set MAX THREADS based on GPU arch
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512;
const uint32_t MAX_THREADS_BATCH_ECNTT =
128; // TODO: hardcoded - allows (2^18 x 64) ECNTT for sm86, decrease this to allow larger ecntt length, batch
// size limited by on-device memory
const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * MAX_NUM_THREADS;
template <typename E>
void reverse_order_kernel(const E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
{
return;
}
/**
* Bit-reverses a batch of input arrays out-of-place inside GPU.
* for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in batch of arrays of some object of type T. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param batch_size the size of the batch.
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order_batch(
const E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
{
return;
}
/**
* Bit-reverses an input array out-of-place inside GPU.
* for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in array of some object of type T of size which is a power of 2. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order(const E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
{
reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
}
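// Illustrative note (not part of the original source): for n = 8 (logn = 3) the 3-bit
// reversal maps indices 0..7 to 0,4,2,6,1,5,3,7, so reverse_order swaps a[1] <-> a[4] and
// a[3] <-> a[6] while a[0], a[2], a[5] and a[7] stay in place.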
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
void ntt_template_kernel_shared_rev(
const E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int ss,
int logn,
E* __restrict__ arr_out)
{
return;
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
void ntt_template_kernel_shared(
const E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int s,
int logn,
E* __restrict__ arr_out)
{
return;
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
*/
template <typename E, typename S>
void
ntt_template_kernel(const E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
{
return;
}
/**
* NTT/INTT inplace batch
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
* @param d_input Input array
* @param n Size of `d_input`
* @param d_twiddles Twiddles
* @param n_twiddles Size of `d_twiddles`
* @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
* @param inverse true for iNTT
* @param coset should be array of length n or a nullptr if NTT is not computed on a coset
* @param stream CUDA stream
* @param is_async if false, perform sync of the supplied CUDA stream at the end of processing
* @param d_output Output array
*/
template <typename E, typename S>
cudaError_t ntt_inplace_batch_template(
const E* d_input,
int n,
S* d_twiddles,
int n_twiddles,
int batch_size,
int logn,
bool inverse,
bool dit,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t stream,
E* d_output)
{
return 0;
}
} // namespace
/**
* @struct Domain
* Struct containing information about the domain on which (i)NTT is evaluated i.e. twiddle factors.
* Twiddle factors are private, static and can only be set using [init_domain](@ref init_domain) function.
* The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
* @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field.
*/
template <typename S>
class Domain
{
// Mutex for protecting access to the domain/device container array
static inline std::mutex device_domain_mutex;
// The domain-per-device container - assumption is init_domain is called once per device per program.
int max_size = 0;
int max_log_size = 0;
S* twiddles = nullptr;
bool initialized = false; // protection for multi-threaded case
std::unordered_map<S, int> coset_index = {};
S* internal_twiddles = nullptr; // required by mixed-radix NTT
S* basic_twiddles = nullptr; // required by mixed-radix NTT
// mixed-radix NTT supports a fast-twiddle option at the cost of additional 4N memory (where N is max NTT size)
S* fast_external_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_external_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
public:
template <typename U>
friend cudaError_t init_domain(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);
template <typename U>
friend cudaError_t release_domain(device_context::DeviceContext& ctx);
template <typename U>
friend U get_root_of_unity(uint64_t logn, device_context::DeviceContext& ctx);
template <typename U>
friend U get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
template <typename U, typename E>
friend cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
};
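// Illustrative lifecycle sketch (assumes an already-constructed device_context::DeviceContext `ctx`;
// shown only to tie the friend declarations above together, as seen from caller code):
//
//   ntt::init_domain(ntt::get_root_of_unity<scalar_t>(1 << 20), ctx, /*fast_twiddles_mode=*/false);
//   ...                            // any number of ntt() calls (forward or inverse) of size <= 2^20
//   ntt::release_domain<scalar_t>(ctx);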
template <typename S>
// static inline Domain<S> domains_for_devices[device_context::MAX_DEVICES] = {};
static inline Domain<S> domains_for_devices[1] = {};
template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return 0;
}
template <typename S>
cudaError_t release_domain(device_context::DeviceContext& ctx)
{
return 0;
}
template <typename S>
S get_root_of_unity(uint64_t max_size)
{
  // round log2(max_size) up to the nearest integer
const auto log_max_size = static_cast<uint32_t>(std::ceil(std::log2(max_size)));
return S::omega(log_max_size);
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity(uint64_t logn);
template <typename S>
S get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx)
{
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
if (logn > domain.max_log_size) {
std::ostringstream oss;
oss << "NTT log_size=" << logn
<< " is too large for the domain. Consider generating your domain with a higher order root of unity.\n";
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
}
const size_t twiddles_idx = 1ULL << (domain.max_log_size - logn);
return domain.twiddles[twiddles_idx];
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
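// Worked example of the indexing above, assuming the domain stores successive powers of its
// generator (twiddles[i] = w^i with w of order 2^max_log_size): for max_log_size = 20 and
// logn = 10, the returned element is twiddles[1 << 10] = w^(2^10), whose order is
// 2^20 / 2^10 = 2^10, exactly the root of unity needed for a size-2^10 NTT.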
template <typename S>
static bool is_choosing_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
{
const bool is_mixed_radix_alg_supported = (logn > 3 && logn != 7);
if (!is_mixed_radix_alg_supported && config.columns_batch)
throw IcicleError(IcicleError_t::InvalidArgument, "columns batch is not supported for given NTT size");
const bool is_user_selected_radix2_alg = config.ntt_algorithm == NttAlgorithm::Radix2;
const bool is_force_radix2 = !is_mixed_radix_alg_supported || is_user_selected_radix2_alg;
if (is_force_radix2) return true;
const bool is_user_selected_mixed_radix_alg = config.ntt_algorithm == NttAlgorithm::MixedRadix;
if (is_user_selected_mixed_radix_alg) return false;
if (config.columns_batch) return false; // radix2 does not currently support columns batch mode.
// Heuristic to automatically select an algorithm
// Note that generally the decision depends on {logn, batch, ordering, inverse, coset, in-place, coeff-field} and
// the specific GPU.
  // The following heuristic is a simplification based on measurements. Users can try both and select the algorithm
// based on the specific case via the 'NTTConfig.ntt_algorithm' field
if (logn >= 16) return false; // mixed-radix is typically faster in those cases
if (logn <= 11) return true; // radix-2 is typically faster for batch<=256 in those cases
const int log_batch = (int)log2(batch_size);
return (logn + log_batch <= 18); // almost the cutoff point where both are equal
}
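// Illustrative caller-side sketch for overriding this heuristic (assumes an already-constructed
// DeviceContext `ctx`; the names below are the ones used elsewhere in this file):
//
//   auto config = ntt::default_ntt_config<scalar_t>(ctx);
//   config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix; // or Radix2 / Auto
//   ntt::ntt(input, size, ntt::NTTDir::kForward, config, output);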
template <typename S, typename E>
cudaError_t radix2_ntt(
const E* d_input,
E* d_output,
S* twiddles,
int ntt_size,
int max_size,
int batch_size,
bool is_inverse,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
{
return 0;
}
template <typename S, typename E>
cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
{
return 0;
}
template <typename S>
NTTConfig<S> default_ntt_config(const device_context::DeviceContext& ctx)
{
NTTConfig<S> config = {
ctx, // ctx
S::one(), // coset_gen
1, // batch_size
false, // columns_batch
Ordering::kNN, // ordering
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // is_async
NttAlgorithm::Auto, // ntt_algorithm
};
return config;
}
// explicit instantiation to avoid having to include this file
template NTTConfig<scalar_t> default_ntt_config(const device_context::DeviceContext& ctx);
} // namespace ntt

View File

@@ -0,0 +1,46 @@
#include "../../../include/fields/id.h"
#define FIELD_ID BN254
#ifdef ECNTT
#define CURVE_ID BN254
#include "../../../include/curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "../../../include/fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif
#include "../../../include/fields/field.cuh"
#include "../../../include/curves/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "../ntt.cpp"
// #include "../kernel_ntt.cpp"
#include <memory>
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}
void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
return;
}
int main(int argc, char** argv)
{
return 0;
}

View File

@@ -1,207 +0,0 @@
#include "fields/id.h"
#define FIELD_ID BN254
#ifdef ECNTT
#define CURVE_ID BN254
#include "curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif
#include "fields/field.cuh"
#include "curves/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "ntt.cu"
#include "kernel_ntt.cu"
#include <memory>
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}
__global__ void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
}
int main(int argc, char** argv)
{
cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
float icicle_time, new_time;
int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
int INV = (argc > 3) ? atoi(argv[3]) : false;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 150;
bool COLUMNS_BATCH = (argc > 5) ? atoi(argv[5]) : false;
int COSET_IDX = (argc > 6) ? atoi(argv[6]) : 2;
const ntt::Ordering ordering = (argc > 7) ? ntt::Ordering(atoi(argv[7])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 8) ? atoi(argv[8]) : true;
// Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
const char* ordering_str = ordering == ntt::Ordering::kNN ? "NN"
: ordering == ntt::Ordering::kNR ? "NR"
: ordering == ntt::Ordering::kRN ? "RN"
: ordering == ntt::Ordering::kRR ? "RR"
: ordering == ntt::Ordering::kNM ? "NM"
: "MN";
printf(
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, columns_batch=%d coset-idx=%d, ordering=%s, fast_tw=%d\n",
NTT_LOG_SIZE, INPLACE, INV, BATCH_SIZE, COLUMNS_BATCH, COSET_IDX, ordering_str, FAST_TW);
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)
// init domain
auto ntt_config = ntt::default_ntt_config<test_scalar>();
ntt_config.ordering = ordering;
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.batch_size = BATCH_SIZE;
ntt_config.columns_batch = COLUMNS_BATCH;
CHK_IF_RETURN(cudaEventCreate(&icicle_start));
CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
CHK_IF_RETURN(cudaEventCreate(&new_start));
CHK_IF_RETURN(cudaEventCreate(&new_stop));
auto start = std::chrono::high_resolution_clock::now();
const scalar_t basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx, FAST_TW);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << "initDomain took: " << duration / 1000 << " MS" << std::endl;
// cpu allocation
auto CpuScalars = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputOld = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputNew = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
// gpu allocation
scalar_t *GpuScalars, *GpuOutputOld, *GpuOutputNew;
scalar_t* GpuScalarsTransposed;
CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuScalarsTransposed, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
// init inputs
// incremental_values(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
random_samples(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
CHK_IF_RETURN(
cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuScalars, GpuScalarsTransposed, NTT_SIZE, BATCH_SIZE);
}
// inplace
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
}
for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
ntt_config.coset_gen = ntt_config.coset_gen * basic_root;
}
auto benchmark = [&](bool is_print, int iterations) -> cudaError_t {
// NEW
CHK_IF_RETURN(cudaEventRecord(new_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(ntt::ntt(
INPLACE ? GpuOutputNew
: COLUMNS_BATCH ? GpuScalarsTransposed
: GpuScalars,
NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputNew));
}
CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));
// OLD
CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::Radix2;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(
ntt::ntt(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
}
CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));
if (is_print) {
printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
printf("New Runtime=%0.3f MS\n", new_time / iterations);
}
return CHK_LAST();
};
CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
int count = INPLACE ? 1 : 10;
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
}
CHK_IF_RETURN(benchmark(true /*=print*/, count));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuOutputNew, GpuScalarsTransposed, BATCH_SIZE, NTT_SIZE);
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, GpuScalarsTransposed, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
// verify
CHK_IF_RETURN(
cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
CHK_IF_RETURN(
cudaMemcpy(CpuOutputOld.get(), GpuOutputOld, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
bool success = true;
for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
// if (i%64==0) printf("\n");
if (CpuOutputNew[i] != CpuOutputOld[i]) {
success = false;
// std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
// break;
} else {
// std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
// break;
}
}
const char* success_str = success ? "SUCCESS!" : "FAIL!";
printf("%s\n", success_str);
CHK_IF_RETURN(cudaFree(GpuScalars));
CHK_IF_RETURN(cudaFree(GpuOutputOld));
CHK_IF_RETURN(cudaFree(GpuOutputNew));
ntt::release_domain<test_scalar>(ntt_config.ctx);
return CHK_LAST();
}

View File

@@ -1,721 +0,0 @@
#ifndef T_NTT
#define T_NTT
#pragma once
#include <stdio.h>
#include <stdint.h>
#include "gpu-utils/modifiers.cuh"
struct stage_metadata {
uint32_t th_stride;
uint32_t ntt_block_size;
uint32_t batch_id;
uint32_t ntt_block_id;
uint32_t ntt_inp_id;
};
#define STAGE_SIZES_DATA \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
__device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
// construction for fast-twiddles
uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 6, 6, 9, 9, 10,
11, 11, 12, 15, 15, 16, 16, 18, 18, 20, 21, 21, 22, 23, 24};
#define STAGE_SIZES_DATA_FAST_TW \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
__device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
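// STAGE_PREV_SIZES appears to hold, for each log-size n, the sum of all but the last entry of
// STAGE_SIZES_DATA_FAST_TW[n], i.e. how many butterfly levels precede the final stage.
// For example, n = 16 decomposes as {6, 5, 5} and STAGE_PREV_SIZES[16] = 6 + 5 = 11.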
template <typename E, typename S>
class NTTEngine
{
public:
E X[8];
S WB[3];
S WI[7];
S WE[8];
DEVICE_INLINE void loadBasicTwiddles(S* basic_twiddles)
{
UNROLL
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[i];
}
}
DEVICE_INLINE void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
{
UNROLL
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[inv ? i + 3 : i];
}
}
DEVICE_INLINE void loadInternalTwiddles64(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles32(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles16(S* data, bool stride)
{
UNROLL
for (int i = 0; i < 7; i++) {
WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
{
UNROLL
for (int i = 0; i < 7; i++) {
uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
for (uint32_t i = 0; i < 8; i++) {
WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
}
}
DEVICE_INLINE void loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric64(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t i = 0; i < 8; i++) {
uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 6);
WE[i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric32(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 5);
WE[4 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric16(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 4);
WE[2 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}
DEVICE_INLINE void
loadGlobalData(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride];
}
}
DEVICE_INLINE void loadGlobalDataColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
}
}
DEVICE_INLINE void
storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride] = X[i];
}
}
DEVICE_INLINE void storeGlobalDataColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
}
}
DEVICE_INLINE void
loadGlobalData32(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
DEVICE_INLINE void loadGlobalData32ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride] = X[4 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData32ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
}
}
}
DEVICE_INLINE void
loadGlobalData16(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
DEVICE_INLINE void loadGlobalData16ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride] = X[2 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData16ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
}
}
}
DEVICE_INLINE void ntt4_2()
{
UNROLL
for (int i = 0; i < 2; i++) {
ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
}
}
DEVICE_INLINE void ntt2_4()
{
UNROLL
for (int i = 0; i < 4; i++) {
ntt2(X[2 * i], X[2 * i + 1]);
}
}
DEVICE_INLINE void ntt2(E& X0, E& X1)
{
E T;
T = X0 + X1;
X1 = X0 - X1;
X0 = T;
}
DEVICE_INLINE void ntt4(E& X0, E& X1, E& X2, E& X3)
{
E T;
T = X0 + X2;
X2 = X0 - X2;
X0 = X1 + X3;
X1 = X1 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
X1 = X1 * WB[0];
X3 = X2 - X1;
X1 = X2 + X1;
X2 = T - X0;
X0 = T + X0;
}
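  // Assuming WB[0] holds the primitive 4th root of unity w_4, ntt4 above is the standard
  // natural-order 4-point transform:
  //   X0' = (X0 + X2) + (X1 + X3)
  //   X1' = (X0 - X2) + w_4 * (X1 - X3)
  //   X2' = (X0 + X2) - (X1 + X3)
  //   X3' = (X0 - X2) - w_4 * (X1 - X3)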
// rbo version
DEVICE_INLINE void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
{
E T;
T = X0 - X1;
X0 = X0 + X1;
X1 = X2 + X3;
X3 = X2 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
X3 = X3 * WB[0];
X2 = X0 - X1;
X0 = X0 + X1;
X1 = T + X3;
X3 = T - X3;
}
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
E T;
// out of 56,623,104 possible mappings, we have:
T = X3 - X7;
X7 = X3 + X7;
X3 = X1 - X5;
X5 = X1 + X5;
X1 = X2 + X6;
X2 = X2 - X6;
X6 = X0 + X4;
X0 = X0 - X4;
T = T * WB[1];
X2 = X2 * WB[1];
X4 = X6 + X1;
X6 = X6 - X1;
X1 = X3 + T;
X3 = X3 - T;
T = X5 + X7;
X5 = X5 - X7;
X7 = X0 + X2;
X0 = X0 - X2;
X1 = X1 * WB[0];
X5 = X5 * WB[1];
X3 = X3 * WB[2];
X2 = X6 + X5;
X6 = X6 - X5;
X5 = X7 - X1;
X1 = X7 + X1;
X7 = X0 - X3;
X3 = X0 + X3;
X0 = X4 + T;
X4 = X4 - T;
}
DEVICE_INLINE void ntt8win()
{
E T;
T = X[3] - X[7];
X[7] = X[3] + X[7];
X[3] = X[1] - X[5];
X[5] = X[1] + X[5];
X[1] = X[2] + X[6];
X[2] = X[2] - X[6];
X[6] = X[0] + X[4];
X[0] = X[0] - X[4];
X[2] = X[2] * WB[0];
X[4] = X[6] + X[1];
X[6] = X[6] - X[1];
X[1] = X[3] + T;
X[3] = X[3] - T;
T = X[5] + X[7];
X[5] = X[5] - X[7];
X[7] = X[0] + X[2];
X[0] = X[0] - X[2];
X[1] = X[1] * WB[1];
X[5] = X[5] * WB[0];
X[3] = X[3] * WB[2];
X[2] = X[6] + X[5];
X[6] = X[6] - X[5];
X[5] = X[1] + X[3];
X[3] = X[1] - X[3];
X[1] = X[7] + X[5];
X[5] = X[7] - X[5];
X[7] = X[0] - X[3];
X[3] = X[0] + X[3];
X[0] = X[4] + T;
X[4] = X[4] - T;
}
DEVICE_INLINE void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + i * 8 + column_id];
}
}
}
DEVICE_INLINE void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + row_id * 8 + i];
}
}
}
DEVICE_INLINE void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + i * 4 + column_id];
}
}
}
DEVICE_INLINE void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + row_id * 8 + i];
}
}
}
DEVICE_INLINE void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + i * 8 + column_id + j];
}
}
}
}
DEVICE_INLINE void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + row_id * 4 + 4 * j + i];
}
}
}
}
DEVICE_INLINE void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + i * 2 + column_id];
}
}
}
DEVICE_INLINE void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + row_id * 8 + i];
}
}
}
DEVICE_INLINE void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + i * 8 + column_id + j];
}
}
}
}
DEVICE_INLINE void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + row_id * 2 + 2 * j + i];
}
}
}
}
DEVICE_INLINE void twiddlesInternal()
{
UNROLL
for (int i = 1; i < 8; i++) {
X[i] = X[i] * WI[i - 1];
}
}
DEVICE_INLINE void twiddlesExternal()
{
UNROLL
for (int i = 0; i < 8; i++) {
X[i] = X[i] * WE[i];
}
}
};
#endif

View File

@@ -0,0 +1,27 @@
set(TARGET icicle_poly)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(POLY_SOURCE ${SRC}/polynomials/polynomials.cpp)
set(POLY_API_SOURCE ${SRC}/polynomials/polynomials_c_api.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${POLY_SOURCE} ${POLY_API_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -39,7 +39,7 @@ namespace polynomials {
/*============================== evaluate ==============================*/
template <typename T>
__device__ T pow(T base, int exp)
T pow(T base, int exp)
{
T result = T::one();
while (exp > 0) {

View File

@@ -0,0 +1,204 @@
#define FIELD_ID BN254
#include "../../include/polynomials/polynomials.h"
namespace polynomials {
template <typename C, typename D, typename I>
Polynomial<C, D, I>::Polynomial()
{
if (nullptr == s_factory) {
throw std::runtime_error("Polynomial factory not initialized. Must call Polynomial::initialize(factory)");
}
m_context = s_factory->create_context();
m_backend = s_factory->create_backend();
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::from_coefficients(const C* coefficients, uint64_t nof_coefficients)
{
Polynomial<C, D, I> P = {};
P.m_backend->from_coefficients(P.m_context, nof_coefficients, coefficients);
return P;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::from_rou_evaluations(const I* evaluations, uint64_t nof_evaluations)
{
Polynomial<C, D, I> P = {};
P.m_backend->from_rou_evaluations(P.m_context, nof_evaluations, evaluations);
return P;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::clone() const
{
Polynomial<C, D, I> P = {};
m_backend->clone(P.m_context, m_context);
return P;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::slice(uint64_t offset, uint64_t stride, uint64_t size)
{
Polynomial res = {};
m_backend->slice(res.m_context, this->m_context, offset, stride, size);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::even()
{
return slice(0, 2, 0 /*all elements*/);
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::odd()
{
return slice(1, 2, 0 /*all elements*/);
}
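// Worked example for the two slices above: if p(x) = a0 + a1*x + a2*x^2 + a3*x^3 then
// p.even() = a0 + a2*x and p.odd() = a1 + a3*x, so p(x) = p.even()(x^2) + x * p.odd()(x^2).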
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator+(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->add(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator-(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->subtract(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator*(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->multiply(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator*(const D& scalar) const
{
Polynomial<C, D, I> res = {};
m_backend->multiply(res.m_context, m_context, scalar);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs)
{
return rhs * scalar;
}
template <typename C, typename D, typename I>
std::pair<Polynomial<C, D, I>, Polynomial<C, D, I>> Polynomial<C, D, I>::divide(const Polynomial<C, D, I>& rhs) const
{
Polynomial<C, D, I> Q = {}, R = {};
m_backend->divide(Q.m_context, R.m_context, m_context, rhs.m_context);
return std::make_pair(std::move(Q), std::move(R));
}
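// divide() is expected to return (Q, R) such that *this = Q * rhs + R with deg(R) < deg(rhs);
// operator/ and operator% below expose the quotient and the remainder separately.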
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator/(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->quotient(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::operator%(const Polynomial& rhs) const
{
Polynomial<C, D, I> res = {};
m_backend->remainder(res.m_context, m_context, rhs.m_context);
return res;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I> Polynomial<C, D, I>::divide_by_vanishing_polynomial(uint64_t vanishing_polynomial_degree) const
{
Polynomial<C, D, I> res = {};
m_backend->divide_by_vanishing_polynomial(res.m_context, m_context, vanishing_polynomial_degree);
return res;
}
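// Over a size-N roots-of-unity domain, the vanishing polynomial of degree N is x^N - 1
// (it vanishes at every N-th root of unity), so this presumably computes *this / (x^N - 1).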
template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::operator+=(const Polynomial& rhs)
{
m_backend->add(m_context, m_context, rhs.m_context);
return *this;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::add_monomial_inplace(C monomial_coeff, uint64_t monomial)
{
m_backend->add_monomial_inplace(m_context, monomial_coeff, monomial);
return *this;
}
template <typename C, typename D, typename I>
Polynomial<C, D, I>& Polynomial<C, D, I>::sub_monomial_inplace(C monomial_coeff, uint64_t monomial)
{
m_backend->sub_monomial_inplace(m_context, monomial_coeff, monomial);
return *this;
}
template <typename C, typename D, typename I>
I Polynomial<C, D, I>::operator()(const D& x) const
{
I eval = {};
evaluate(&x, &eval);
return eval;
}
template <typename C, typename D, typename I>
void Polynomial<C, D, I>::evaluate(const D* x, I* eval) const
{
m_backend->evaluate(m_context, x, eval);
}
template <typename C, typename D, typename I>
void Polynomial<C, D, I>::evaluate_on_domain(D* domain, uint64_t size, I* evals /*OUT*/) const
{
return m_backend->evaluate_on_domain(m_context, domain, size, evals);
}
template <typename C, typename D, typename I>
int64_t Polynomial<C, D, I>::degree()
{
return m_backend->degree(m_context);
}
template <typename C, typename D, typename I>
C Polynomial<C, D, I>::get_coeff(uint64_t idx) const
{
return m_backend->get_coeff(m_context, idx);
}
template <typename C, typename D, typename I>
uint64_t Polynomial<C, D, I>::copy_coeffs(C* host_coeffs, uint64_t start_idx, uint64_t end_idx) const
{
return m_backend->copy_coeffs(m_context, host_coeffs, start_idx, end_idx);
}
template <typename C, typename D, typename I>
std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
Polynomial<C, D, I>::get_coefficients_view()
{
return m_backend->get_coefficients_view(m_context);
}
template <typename C, typename D, typename I>
std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
Polynomial<C, D, I>::get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
{
return m_backend->get_rou_evaluations_view(m_context, nof_evaluations, is_reversed);
}
// explicit instantiation for default type (scalar field)
template class Polynomial<scalar_t>;
template Polynomial<scalar_t> operator*(const scalar_t& c, const Polynomial<scalar_t>& rhs);
} // namespace polynomials

View File

@@ -0,0 +1,284 @@
#define FIELD_ID BN254
#include "../../include/polynomials/polynomials.h"
#include "../../include/fields/field_config.cuh"
#include "../../include/utils/utils.h"
#include "../../include/utils/integrity_pointer.h"
#include "../../include/polynomials/cuda_backend/polynomial_cuda_backend.cuh"
namespace polynomials {
extern "C" {
// Defines a polynomial instance based on the scalar type from the FIELD configuration.
typedef Polynomial<scalar_t> PolynomialInst;
bool CONCAT_EXPAND(FIELD, polynomial_init_cuda_backend)()
{
static auto cuda_factory = std::make_shared<CUDAPolynomialFactory<scalar_t>>();
PolynomialInst::initialize(cuda_factory);
return cuda_factory != nullptr;
}
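// Illustrative C-caller sketch (assumes FIELD expands to bn254 here, so the exported symbols
// are prefixed with bn254_; the coefficient array and its size are caller-provided):
//
//   bn254_polynomial_init_cuda_backend();                                  // once per process
//   PolynomialInst* p = bn254_polynomial_create_from_coefficients(coeffs, size);
//   PolynomialInst* q = bn254_polynomial_clone(p);
//   bn254_polynomial_add_inplace(p, q);
//   bn254_polynomial_delete(q);
//   bn254_polynomial_delete(p);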
// Constructs a polynomial from a set of coefficients.
// coeffs: Array of coefficients.
// size: Number of coefficients in the array.
// Returns a pointer to the newly created polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_coefficients)(scalar_t* coeffs, size_t size)
{
auto result = new PolynomialInst(PolynomialInst::from_coefficients(coeffs, size));
return result;
}
// Constructs a polynomial from evaluations at the roots of unity.
// evals: Array of evaluations.
// size: Number of evaluations in the array.
// Returns a pointer to the newly created polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_rou_evaluations)(scalar_t* evals, size_t size)
{
auto result = new PolynomialInst(PolynomialInst::from_rou_evaluations(evals, size));
return result;
}
// Clones an existing polynomial instance.
// p: Pointer to the polynomial instance to clone.
// Returns a pointer to the cloned polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_clone)(const PolynomialInst* p)
{
auto result = new PolynomialInst(p->clone());
return result;
}
// Deletes a polynomial instance, freeing its memory.
// instance: Pointer to the polynomial instance to delete.
void CONCAT_EXPAND(FIELD, polynomial_delete)(PolynomialInst* instance) { delete instance; }
// Prints a polynomial to stdout
void CONCAT_EXPAND(FIELD, polynomial_print)(PolynomialInst* p) { std::cout << *p << std::endl; }
// Adds two polynomials.
// a, b: Pointers to the polynomial instances to add.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_add)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a + *b));
return result;
}
// Adds a polynomial to another in place.
// a: Pointer to the polynomial to add to.
// b: Pointer to the polynomial to add.
void CONCAT_EXPAND(FIELD, polynomial_add_inplace)(PolynomialInst* a, const PolynomialInst* b) { *a += *b; }
// Subtracts one polynomial from another.
// a, b: Pointers to the polynomial instances (minuend and subtrahend, respectively).
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_subtract)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a - *b));
return result;
}
// Multiplies two polynomials.
// a, b: Pointers to the polynomial instances to multiply.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a * *b));
return result;
}
// Multiplies a polynomial by scalar.
// a: Pointer to the polynomial instance.
// scalar: Scalar to multiply by.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply_by_scalar)(const PolynomialInst* a, const scalar_t& scalar)
{
auto result = new PolynomialInst(std::move(*a * scalar));
return result;
}
// Divides one polynomial by another, returning both quotient and remainder.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// q: Output parameter for the quotient.
// r: Output parameter for the remainder.
void CONCAT_EXPAND(FIELD, polynomial_division)(
const PolynomialInst* a, const PolynomialInst* b, PolynomialInst** q /*OUT*/, PolynomialInst** r /*OUT*/)
{
auto [_q, _r] = a->divide(*b);
*q = new PolynomialInst(std::move(_q));
*r = new PolynomialInst(std::move(_r));
}
// Calculates the quotient of dividing one polynomial by another.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// Returns a pointer to the resulting quotient polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_quotient)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a / *b));
return result;
}
// Calculates the remainder of dividing one polynomial by another.
// a, b: Pointers to the polynomial instances (dividend and divisor, respectively).
// Returns a pointer to the resulting remainder polynomial instance.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_remainder)(const PolynomialInst* a, const PolynomialInst* b)
{
auto result = new PolynomialInst(std::move(*a % *b));
return result;
}
// Divides a polynomial by the vanishing polynomial of a given degree over the roots-of-unity domain.
// p: Pointer to the polynomial instance.
// vanishing_poly_degree: Degree of the vanishing polynomial.
// Returns a pointer to the resulting polynomial instance.
PolynomialInst*
CONCAT_EXPAND(FIELD, polynomial_divide_by_vanishing)(const PolynomialInst* p, uint64_t vanishing_poly_degree)
{
auto result = new PolynomialInst(std::move(p->divide_by_vanishing_polynomial(vanishing_poly_degree)));
return result;
}
// Adds a monomial to a polynomial in place.
// p: Pointer to the polynomial instance.
// monomial_coeff: Coefficient of the monomial to add.
// monomial: Degree of the monomial to add.
void CONCAT_EXPAND(FIELD, polynomial_add_monomial_inplace)(
PolynomialInst* p, const scalar_t& monomial_coeff, uint64_t monomial)
{
p->add_monomial_inplace(monomial_coeff, monomial);
}
// Subtracts a monomial from a polynomial in place.
// p: Pointer to the polynomial instance.
// monomial_coeff: Coefficient of the monomial to subtract.
// monomial: Degree of the monomial to subtract.
void CONCAT_EXPAND(FIELD, polynomial_sub_monomial_inplace)(
PolynomialInst* p, const scalar_t& monomial_coeff, uint64_t monomial)
{
p->sub_monomial_inplace(monomial_coeff, monomial);
}
// Creates a new polynomial instance by slicing an existing polynomial.
// p: Pointer to the original polynomial instance to be sliced.
// offset: Starting index for the slice.
// stride: Interval between elements in the slice.
// size: Number of elements in the slice.
// Returns: Pointer to the new polynomial instance containing the slice.
PolynomialInst*
CONCAT_EXPAND(FIELD, polynomial_slice)(PolynomialInst* p, uint64_t offset, uint64_t stride, uint64_t size)
{
auto result = new PolynomialInst(std::move(p->slice(offset, stride, size)));
return result;
}
// Creates a new polynomial instance containing only the even-powered terms of the original polynomial.
// p: Pointer to the original polynomial instance.
// Returns: Pointer to the new polynomial instance containing only even-powered terms.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_even)(PolynomialInst* p)
{
auto result = new PolynomialInst(std::move(p->even()));
return result;
}
// Creates a new polynomial instance containing only the odd-powered terms of the original polynomial.
// p: Pointer to the original polynomial instance.
// Returns: Pointer to the new polynomial instance containing only odd-powered terms.
PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_odd)(PolynomialInst* p)
{
auto result = new PolynomialInst(std::move(p->odd()));
return result;
}
// Evaluates a polynomial on a domain of points.
// p: Pointer to the polynomial instance.
// domain: Array of points constituting the domain.
// domain_size: Number of points in the domain.
// evals: Output array for the evaluations.
void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_domain)(
const PolynomialInst* p, scalar_t* domain, uint64_t domain_size, scalar_t* evals /*OUT*/)
{
return p->evaluate_on_domain(domain, domain_size, evals);
}
// Returns the degree of a polynomial.
// p: Pointer to the polynomial instance.
// Returns the degree of the polynomial.
int64_t CONCAT_EXPAND(FIELD, polynomial_degree)(PolynomialInst* p) { return p->degree(); }
// Copies a range of polynomial coefficients to host/device memory.
// p: Pointer to the polynomial instance.
// memory: Array to copy the coefficients into. If NULL, nothing is copied.
// start_idx: Start index of the range to copy.
// end_idx: End index of the range to copy.
// Returns the number of coefficients copied. If memory is NULL, returns the total number of coefficients.
uint64_t CONCAT_EXPAND(FIELD, polynomial_copy_coeffs_range)(
PolynomialInst* p, scalar_t* memory, uint64_t start_idx, uint64_t end_idx)
{
return p->copy_coeffs(memory, start_idx, end_idx);
}
// Retrieves a raw device-memory pointer to the polynomial coefficients.
// p: Pointer to the polynomial instance.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a raw mutable pointer to the coefficients.
scalar_t* CONCAT_EXPAND(FIELD, polynomial_get_coeffs_raw_ptr)(
PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [coeffs, _size, _device_id] = p->get_coefficients_view();
*size = _size;
*device_id = _device_id;
return const_cast<scalar_t*>(coeffs.get());
}
// Retrieves a device-memory view of the polynomial coefficients.
// p: Pointer to the polynomial instance.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a pointer to an integrity pointer encapsulating the coefficients view.
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_coeff_view)(
PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [coeffs, _size, _device_id] = p->get_coefficients_view();
*size = _size;
*device_id = _device_id;
return new IntegrityPointer<scalar_t>(std::move(coeffs));
}
// Retrieves a device-memory view of the polynomial's evaluations on the roots of unity.
// p: Pointer to the polynomial instance.
// nof_evals: Number of evaluations.
// is_reversed: Whether the evaluations are in reversed order.
// size: Output parameter for the size of the view.
// device_id: Output parameter for the device ID.
// Returns a pointer to an integrity pointer encapsulating the evaluations view.
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
{
auto [rou_evals, _size, _device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
*size = _size;
*device_id = _device_id;
return new IntegrityPointer<scalar_t>(std::move(rou_evals));
}
// Reads the pointer from an integrity pointer.
// p: Pointer to the integrity pointer.
// Returns the raw pointer if still valid, otherwise NULL.
const scalar_t* CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_get)(IntegrityPointer<scalar_t>* p)
{
return p->get();
}
// Checks if an integrity pointer is still valid.
// p: Pointer to the integrity pointer.
// Returns true if the pointer is valid, false otherwise.
bool CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_is_valid)(IntegrityPointer<scalar_t>* p) { return p->isValid(); }
// Destroys an integrity pointer, freeing its resources.
// p: Pointer to the integrity pointer to destroy.
void CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_destroy)(IntegrityPointer<scalar_t>* p) { delete p; }
} // extern "C"
} // namespace polynomials

View File

@@ -0,0 +1,25 @@
set(TARGET icicle_poseidon)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(POSEIDON_SOURCE ${SRC}/poseidon/poseidon.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${POSEIDON_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -1,21 +1,21 @@
#include "poseidon/poseidon.cuh"
#include "../../include/poseidon/poseidon.cuh"
/// These are pre-calculated constants for different curves
#include "fields/id.h"
#include "../../include/fields/id.h"
#if FIELD_ID == BN254
#include "poseidon/constants/bn254_poseidon.h"
#include "../../include/poseidon/constants/bn254_poseidon.h"
using namespace poseidon_constants_bn254;
#elif FIELD_ID == BLS12_381
#include "poseidon/constants/bls12_381_poseidon.h"
#include "../../include/poseidon/constants/bls12_381_poseidon.h"
using namespace poseidon_constants_bls12_381;
#elif FIELD_ID == BLS12_377
#include "poseidon/constants/bls12_377_poseidon.h"
#include "../../include/poseidon/constants/bls12_377_poseidon.h"
using namespace poseidon_constants_bls12_377;
#elif FIELD_ID == BW6_761
#include "poseidon/constants/bw6_761_poseidon.h"
#include "../../include/poseidon/constants/bw6_761_poseidon.h"
using namespace poseidon_constants_bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "poseidon/constants/grumpkin_poseidon.h"
#include "../../include/poseidon/constants/grumpkin_poseidon.h"
using namespace poseidon_constants_grumpkin;
#endif
@@ -29,8 +29,8 @@ namespace poseidon {
device_context::DeviceContext& ctx,
PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = ctx.stream;
// CHK_INIT_IF_RETURN();
int& stream = ctx.stream;
int width = arity + 1;
int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
int mds_matrix_len = width * width;
@@ -39,10 +39,10 @@ namespace poseidon {
// Malloc memory for copying constants
S* d_constants;
CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
// CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
// Copy constants
CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
// CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
S* round_constants = d_constants;
S* mds_matrix = round_constants + round_constants_len;
@@ -56,18 +56,18 @@ namespace poseidon {
S domain_tag = S::from(tree_domain_tag_value);
// Make sure all the constants have been copied
CHK_IF_RETURN(cudaStreamSynchronize(stream));
// CHK_IF_RETURN(cudaStreamSynchronize(stream));
*poseidon_constants = {arity, partial_rounds, full_rounds_half, round_constants,
mds_matrix, non_sparse_matrix, sparse_matrices, domain_tag};
return CHK_LAST();
return 0;
}
template <typename S>
cudaError_t init_optimized_poseidon_constants(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
//CHK_INIT_IF_RETURN();
int full_rounds_half = FULL_ROUNDS_DEFAULT;
int partial_rounds;
unsigned char* constants;
@@ -96,7 +96,7 @@ namespace poseidon {
create_optimized_poseidon_constants(arity, full_rounds_half, partial_rounds, h_constants, ctx, poseidon_constants);
return CHK_LAST();
return 0; //CHK_LAST();
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(

View File

@@ -1,38 +1,15 @@
#include "poseidon/poseidon.cuh"
#include "gpu-utils/modifiers.cuh"
#include "../../include/poseidon/poseidon.cuh"
#include "../../include/gpu-utils/modifiers.cuh"
namespace poseidon {
template <typename S, int T>
__global__ void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int state_number = idx / T;
if (state_number >= number_of_states) { return; }
int element_number = idx % T;
S prepared_element;
// Domain separation
if (element_number == 0) {
prepared_element = domain_tag;
} else {
if (aligned) {
prepared_element = states[idx];
} else {
prepared_element = states[idx - 1];
}
}
// We need __syncthreads here if the state is not aligned
// because then we need to shift the vector [A, B, 0] -> [D, A, B]
if (!aligned) { __syncthreads(); }
// Store element in state
states[idx] = prepared_element;
return;
}
template <typename S>
DEVICE_INLINE S sbox_alpha_five(S element)
S sbox_alpha_five(S element)
{
S result = S::sqr(element);
result = S::sqr(result);
@@ -40,23 +17,13 @@ namespace poseidon {
}
template <typename S, int T>
__device__ S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
{
__syncthreads();
shared_states[threadIdx.x] = element;
__syncthreads();
typename S::Wide element_wide = S::mul_wide(shared_states[vec_number * T], matrix[element_number]);
UNROLL
for (int i = 1; i < T; i++) {
element_wide = element_wide + S::mul_wide(shared_states[vec_number * T + i], matrix[i * T + element_number]);
}
return S::reduce(element_wide);
return element;
}
template <typename S, int T>
__device__ S full_round(
S full_round(
S element,
size_t rc_offset,
int local_state_number,
@@ -67,110 +34,40 @@ namespace poseidon {
S* shared_states,
const PoseidonConstants<S>& constants)
{
if (add_pre_round_constants) {
element = element + constants.round_constants[rc_offset + element_number];
rc_offset += T;
}
element = sbox_alpha_five(element);
if (!skip_rc) { element = element + constants.round_constants[rc_offset + element_number]; }
// Multiply all the states by mds matrix
S* matrix = multiply_by_mds ? constants.mds_matrix : constants.non_sparse_matrix;
return vecs_mul_matrix<S, T>(element, matrix, element_number, local_state_number, shared_states);
return element;
}
template <typename S, int T>
__global__ void full_rounds(
void full_rounds(
S* states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConstants<S> constants)
{
extern __shared__ S shared_states[];
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
int state_number = idx / T;
if (state_number >= number_of_states) { return; }
int local_state_number = threadIdx.x / T;
int element_number = idx % T;
S new_el = states[idx];
bool add_pre_round_constants = first_half;
for (int i = 0; i < constants.full_rounds_half; i++) {
new_el = full_round<S, T>(
new_el, rc_offset, local_state_number, element_number, !first_half || (i < (constants.full_rounds_half - 1)),
add_pre_round_constants, !first_half && (i == constants.full_rounds_half - 1), shared_states, constants);
rc_offset += T;
if (add_pre_round_constants) {
rc_offset += T;
add_pre_round_constants = false;
}
}
states[idx] = new_el;
return;
}
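A CPU-side sketch of the round-constant bookkeeping in the full-rounds loop; the field operations and the matrix multiply are stubbed out, and the helper names are placeholders rather than library API. Pre-round constants consume an extra T constants once at the start of the first half, and the post-S-box constant is skipped on the final round of the second half:

#include <cstddef>
#include <cstdint>
#include <vector>

static uint64_t add_rc(uint64_t x, uint64_t rc) { return x + rc; } // stub
static uint64_t sbox5(uint64_t x) { return x * x * x * x * x; }    // stub

void full_rounds_cpu(
  std::vector<uint64_t>& state, const std::vector<uint64_t>& round_constants,
  size_t& rc_offset, int full_rounds_half, bool first_half)
{
  const int T = (int)state.size();
  bool add_pre_round_constants = first_half;
  for (int r = 0; r < full_rounds_half; r++) {
    bool skip_rc = !first_half && (r == full_rounds_half - 1);
    for (int e = 0; e < T; e++) {
      uint64_t el = state[e];
      if (add_pre_round_constants) el = add_rc(el, round_constants[rc_offset + e]);
      el = sbox5(el);
      if (!skip_rc)
        el = add_rc(el, round_constants[rc_offset + (add_pre_round_constants ? T : 0) + e]);
      state[e] = el;
    }
    // (the MDS / non-sparse matrix multiply of the whole state would happen here)
    rc_offset += T;
    if (add_pre_round_constants) { rc_offset += T; add_pre_round_constants = false; }
  }
}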
template <typename S, int T>
__device__ S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
{
S element = state[0];
element = sbox_alpha_five(element);
element = element + constants.round_constants[rc_offset];
S* sparse_matrix = &constants.sparse_matrices[(T * 2 - 1) * round_number];
typename S::Wide state_0_wide = S::mul_wide(element, sparse_matrix[0]);
UNROLL
for (int i = 1; i < T; i++) {
state_0_wide = state_0_wide + S::mul_wide(state[i], sparse_matrix[i]);
}
state[0] = S::reduce(state_0_wide);
UNROLL
for (int i = 1; i < T; i++) {
state[i] = state[i] + (element * sparse_matrix[T + i - 1]);
}
return element;
}
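The partial round applies the S-box and a round constant to state[0] only, then a sparse matrix of 2T-1 entries: the first T entries produce the new state[0], and the remaining T-1 scale the S-boxed element into the other slots. A CPU sketch with toy modular arithmetic in place of S::mul_wide / S::reduce:

#include <cstdint>
#include <vector>

static const uint64_t Q = 2305843009213693951ULL; // toy modulus, illustration only

static uint64_t mulq(uint64_t a, uint64_t b) { return (uint64_t)((__uint128_t)a * b % Q); }
static uint64_t pow5(uint64_t x) { uint64_t x2 = mulq(x, x), x4 = mulq(x2, x2); return mulq(x4, x); }

// One partial round over a state of width T, sparse_matrix holding 2T-1 entries
// laid out exactly as in the removed kernel.
void partial_round_cpu(
  std::vector<uint64_t>& state, uint64_t round_constant, const std::vector<uint64_t>& sparse_matrix)
{
  const size_t T = state.size();
  uint64_t element = (pow5(state[0]) + round_constant) % Q;

  uint64_t new_state0 = mulq(element, sparse_matrix[0]);
  for (size_t i = 1; i < T; i++)
    new_state0 = (new_state0 + mulq(state[i], sparse_matrix[i])) % Q;

  for (size_t i = 1; i < T; i++)
    state[i] = (state[i] + mulq(element, sparse_matrix[T + i - 1])) % Q;

  state[0] = new_state0;
}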
template <typename S, int T>
__global__ void
void
partial_rounds(S* states, size_t number_of_states, size_t rc_offset, const PoseidonConstants<S> constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
S state[T];
UNROLL
for (int i = 0; i < T; i++) {
state[i] = states[idx * T + i];
}
for (int i = 0; i < constants.partial_rounds; i++) {
partial_round<S, T>(state, rc_offset, i, constants);
rc_offset++;
}
UNROLL
for (int i = 0; i < T; i++) {
states[idx * T + i] = state[i];
}
return;
}
// This function just copies the hash results from the states to the output
template <typename S, int T>
__global__ void get_hash_results(S* states, size_t number_of_states, S* out)
void get_hash_results(S* states, size_t number_of_states, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
out[idx] = states[idx * T + 1];
return;
}
template <typename S, int T>
__global__ void copy_recursive(S* state, size_t number_of_states, S* out)
void copy_recursive(S* state, size_t number_of_states, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
state[(idx / (T - 1) * T) + (idx % (T - 1)) + 1] = out[idx];
return;
}
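copy_recursive feeds a layer of hash outputs back in as inputs to the next tree layer: every T-1 outputs fill elements 1..T-1 of one new state, leaving slot 0 free for the domain tag. A CPU version of the same index arithmetic:

#include <cstddef>
#include <cstdint>

// For T = 3, outputs [o0, o1, o2, o3] are scattered as [_, o0, o1, _, o2, o3],
// the '_' slots being filled by the state-preparation step afterwards.
void copy_recursive_cpu(uint64_t* state, size_t number_of_outputs, const uint64_t* out, int T)
{
  for (size_t idx = 0; idx < number_of_outputs; idx++)
    state[(idx / (T - 1)) * T + (idx % (T - 1)) + 1] = out[idx];
}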
} // namespace poseidon

View File

@@ -0,0 +1,47 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/poseidon/poseidon.cuh"
#include "constants.cu"
#include "kernels.cu"
typedef int cudaError_t;
namespace poseidon {
template <typename S, int T>
cudaError_t
permute_many(S* states, size_t number_of_states, const PoseidonConstants<S>& constants, int& stream)
{
return 0;
}
template <typename S, int T>
cudaError_t poseidon_hash(
S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config)
{
return 0;
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
scalar_t* input,
scalar_t* output,
int number_of_states,
int arity,
const PoseidonConstants<scalar_t>& constants,
PoseidonConfig& config)
{
switch (arity) {
case 2:
return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
case 4:
return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
case 8:
return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
case 11:
return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
default:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
}
return 0;
}
} // namespace poseidon

View File

@@ -0,0 +1,27 @@
set(TARGET icicle_vec_ops)
set(CURVE_TARGET icicle_curve)
set(FIELD_TARGET icicle_field)
set(SRC ../)
set(VEC_OPS_SOURCE ${SRC}/vec_ops/extern.cpp)
set(VEC_OPS_EXT_SOURCE ${SRC}/vec_ops/extern_extension.cpp)
# if(MSM)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern.cpp)
# if(G2)
# list(APPEND CURVE_SOURCE ${SRC}/msm/extern_g2.cpp)
# endif()
# endif()
# if(ECNTT)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/extern_ecntt.cpp)
# list(APPEND CURVE_SOURCE ${SRC}/ntt/kernel_ntt.cpp)
# endif()
add_library(${TARGET} STATIC ${VEC_OPS_SOURCE} ${VEC_OPS_EXT_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_curve_${CURVE}")
target_compile_definitions(${TARGET} PUBLIC CURVE=${CURVE})
target_link_libraries(${TARGET} PRIVATE ${FIELD_TARGET})
target_link_libraries(${TARGET} PRIVATE ${CURVE_TARGET})
target_compile_features(${TARGET} PUBLIC cxx_std_17)

View File

@@ -0,0 +1,63 @@
#define FIELD_ID BN254
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/utils/utils.h"
#include "vec_ops.cu"
namespace vec_ops {
/**
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, mul_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return mul<scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Add](@ref Add) function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, add_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return add<scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, sub_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config, scalar_t* result)
{
return sub<scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of transpose_batch function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, transpose_matrix_cuda)(
const scalar_t* input,
uint32_t row_size,
uint32_t column_size,
scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops
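These wrappers exist to expose a stable C-linkage symbol per field; CONCAT_EXPAND pastes the field prefix onto the function name at preprocessing time. A self-contained sketch of that token-pasting pattern under assumed macro behaviour (the real macro is presumably defined in the utils/utils.h included above), with a plain multiply so it compiles without the library:

#include <cstdint>

// Two-level paste so the field name is expanded before concatenation,
// producing a symbol such as `toyfield_mul` here.
#define CONCAT_DIRECT_DEMO(a, b) a##_##b
#define CONCAT_EXPAND_DEMO(a, b) CONCAT_DIRECT_DEMO(a, b)
#define FIELD_DEMO toyfield

extern "C" int CONCAT_EXPAND_DEMO(FIELD_DEMO, mul)(const uint64_t* a, const uint64_t* b, int n, uint64_t* out)
{
  for (int i = 0; i < n; i++) out[i] = a[i] * b[i]; // element-wise, no field reduction
  return 0; // 0 standing in for cudaSuccess
}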

View File

@@ -0,0 +1,60 @@
#define FIELD_ID BABY_BEAR
#include "../../include/fields/field_config.cuh"
using namespace field_config;
#include "../../include/utils/utils.h"
#include "vec_ops.cu"
namespace vec_ops {
/**
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable
* during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_mul_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return mul<extension_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Add](@ref Add) function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_add_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return add<extension_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_sub_cuda)(
extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config, extension_t* result)
{
return sub<extension_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of transpose_batch function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_transpose_matrix_cuda)(
const extension_t* input,
uint32_t row_size,
uint32_t column_size,
extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<extension_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops

View File

@@ -1,9 +1,8 @@
#include <cuda.h>
#include <stdexcept>
#include "vec_ops/vec_ops.cuh"
#include "gpu-utils/device_context.cuh"
#include "utils/mont.cuh"
#include "../../include/vec_ops/vec_ops.cuh"
#include "../../include/gpu-utils/device_context.cuh"
// #include "utils/mont.cuh"
namespace vec_ops {
@@ -12,97 +11,47 @@ namespace vec_ops {
#define MAX_THREADS_PER_BLOCK 256
template <typename E>
__global__ void mul_kernel(const E* scalar_vec, const E* element_vec, int n, E* result)
void mul_kernel(const E* scalar_vec, const E* element_vec, int n, E* result)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
return;
}
template <typename E, typename S>
__global__ void mul_scalar_kernel(const E* element_vec, const S scalar, int n, E* result)
void mul_scalar_kernel(const E* element_vec, const S scalar, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec[tid] * (scalar); }
return;
}
template <typename E>
__global__ void div_element_wise_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void div_element_wise_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
// TODO: implement a better version based on https://eprint.iacr.org/2008/199
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] * E::inverse(element_vec2[tid]); }
return;
}
template <typename E>
__global__ void add_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void add_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; }
return;
}
template <typename E>
__global__ void sub_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
void sub_kernel(const E* element_vec1, const E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
return;
}
template <typename E>
__global__ void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
return;
}
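A plain CPU transpose with the same index arithmetic as the removed kernel, assuming the input is stored with row_size elements per row:

#include <cstdint>
#include <vector>

// out[(i % row_size) * column_size + (i / row_size)] = in[i]: element (r, c) of
// the input lands at position (c, r) of the output, i.e. an out-of-place transpose.
template <typename E>
void transpose_cpu(const std::vector<E>& in, std::vector<E>& out, uint32_t row_size, uint32_t column_size)
{
  for (uint32_t i = 0; i < row_size * column_size; i++)
    out[(i % row_size) * column_size + (i / row_size)] = in[i];
}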
} // namespace
typedef int cudaError_t;
template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
{
CHK_INIT_IF_RETURN();
// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;
E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
const E *d_vec_a, *d_vec_b;
if (!config.is_a_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_a = d_alloc_vec_a;
} else {
d_vec_a = vec_a;
}
if (!config.is_b_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
d_vec_b = d_alloc_vec_b;
} else {
d_vec_b = vec_b;
}
if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
} else {
d_result = result;
}
// Call the kernel to perform element-wise operation
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
}
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));
return CHK_LAST();
return 0;
}
template <typename E>
@@ -133,35 +82,6 @@ namespace vec_ops {
bool on_device,
bool is_async)
{
int number_of_threads = MAX_THREADS_PER_BLOCK;
int number_of_blocks = (row_size * column_size + number_of_threads - 1) / number_of_threads;
cudaStream_t stream = ctx.stream;
const E* d_mat_in;
E* d_allocated_input = nullptr;
E* d_mat_out;
if (!on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, row_size * column_size * sizeof(E), ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_allocated_input, mat_in, row_size * column_size * sizeof(E), cudaMemcpyHostToDevice, ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&d_mat_out, row_size * column_size * sizeof(E), ctx.stream));
d_mat_in = d_allocated_input;
} else {
d_mat_in = mat_in;
d_mat_out = mat_out;
}
transpose_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(d_mat_in, d_mat_out, row_size, column_size);
if (!on_device) {
CHK_IF_RETURN(
cudaMemcpyAsync(mat_out, d_mat_out, row_size * column_size * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_mat_out, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, ctx.stream));
}
if (!is_async) return CHK_STICKY(cudaStreamSynchronize(ctx.stream));
return CHK_LAST();
return 0;
}
} // namespace vec_ops

View File

@@ -514,20 +514,20 @@ namespace msm {
class dummy_g2_t : public scalar_t
{
public:
static constexpr __host__ __device__ dummy_g2_t to_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ dummy_g2_t to_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ __device__ dummy_g2_t from_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ dummy_g2_t from_affine(const dummy_g2_t& point) { return point; }
static constexpr __host__ __device__ dummy_g2_t generator() { return dummy_g2_t{scalar_t::one()}; }
static constexpr __host__ dummy_g2_t generator() { return dummy_g2_t{scalar_t::one()}; }
static __host__ __device__ dummy_g2_t zero() { return dummy_g2_t{scalar_t::zero()}; }
static __host__ dummy_g2_t zero() { return dummy_g2_t{scalar_t::zero()}; }
friend __host__ __device__ dummy_g2_t operator*(const scalar_t& xs, const dummy_g2_t& ys)
friend __host__ dummy_g2_t operator*(const scalar_t& xs, const dummy_g2_t& ys)
{
return dummy_g2_t{scalar_t::reduce(scalar_t::mul_wide(xs, ys))};
}
friend __host__ __device__ dummy_g2_t operator+(const dummy_g2_t& xs, const dummy_g2_t& ys)
friend __host__ dummy_g2_t operator+(const dummy_g2_t& xs, const dummy_g2_t& ys)
{
scalar_t rs = {};
scalar_t::add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);