Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-01-11 08:28:15 -05:00)

Compare commits: v2.2.0...mod_mult_o (7 commits)
| Author | SHA1 | Date |
|---|---|---|
| | a2f221dda1 | |
| | c166b6d4d5 | |
| | ecc3970c12 | |
| | a20f603f6f | |
| | 32a178bc27 | |
| | 8d094bd5fb | |
| | 08c34a5183 | |
.gitignore (vendored, 1 line changed)
@@ -11,3 +11,4 @@
**/__pycache__/
**/.DS_Store
**/Cargo.lock
+**/icicle/build/
@@ -41,7 +41,7 @@ nvcc -o build/<ENTER_DIR_NAME> ./icicle/appUtils/ntt/ntt.cu ./icicle/appUtils/ms

### Testing the CUDA code

-We are using [googletest] library for testing. To build and run the test suite for finite field and elliptic curve arithmetic, run from the `icicle` folder:
+We are using [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder:

```sh
mkdir -p build
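
For reference, the new `icicle/README.md` added in this compare spells out the full sequence; a typical invocation from the `icicle` folder (assuming CMake 3.16+ and the CUDA toolkit are on the `PATH`) is:

```sh
# Configure, build, and run the googletest suite from the icicle folder.
mkdir -p build
cmake -S . -B build
cmake --build build
cd build && ctest
cd ..
```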

@@ -1,5 +1,4 @@
-cmake_minimum_required(VERSION 3.14)
-project(icicle)
+cmake_minimum_required(VERSION 3.16)

# GoogleTest requires at least C++14
set(CMAKE_CXX_STANDARD 17)
@@ -9,9 +8,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
# add the target cuda architectures
# each additional architecture increases the compilation time and output file size
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-set(CMAKE_CUDA_ARCHITECTURES 80)
+set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
-project(bellman-cuda LANGUAGES CUDA CXX)
+project(icicle LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
@@ -23,6 +22,10 @@ FetchContent_Declare(
URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
+
+# boosting lib
+include_directories("/home/miner/include/boost_1_80_0")
+
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
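
Note that `CMAKE_CUDA_ARCHITECTURES=native` is only recognized by CMake 3.24 and newer; as the comment in the diff says, older versions ignore it. On older CMake a target architecture can instead be passed explicitly at configure time. A minimal sketch, assuming an sm_80 (Ampere) GPU and the `build` directory layout used above:

```sh
# Explicitly target compute capability 8.0 when CMake < 3.24 cannot use "native".
cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES=80
cmake --build build
```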

icicle/README.md (new file, 81 lines)
@@ -0,0 +1,81 @@
# Tests

```sh
mkdir -p build; cmake -S . -B build; cmake --build build; cd build && ctest; cd ..
```

## Prerequisites on Ubuntu

Before proceeding, make sure the following software is installed (a quick way to check the versions is shown after this list):

1. CMake, version 3.16 or newer, which can be downloaded from [cmake.org](https://cmake.org/files/).
   It is recommended to have the latest version installed.
2. [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu) version 12.0 or newer.
3. GCC, version 9 or newer recommended.
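
Each tool reports its own version, so the requirements above can be confirmed directly:

```sh
# Print toolchain versions; compare against the prerequisites above.
cmake --version   # expect 3.16 or newer
nvcc --version    # expect CUDA 12.0 or newer
gcc --version     # expect GCC 9 or newer
```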

## Troubleshooting

In case you encounter problems during the build, please follow the points below to troubleshoot:

### 1 - Check CMake log files

If there are issues with the CMake configuration, please check the logs, which are located in the `./build/CMakeFiles` directory. Depending on the version of CMake, the log file may have a different name. For example, for CMake version 3.20, one of the log files is called `CMakeConfigureLog.yaml`.

### 2 - Check for conflicting GCC versions

Make sure that there are no conflicting versions of GCC installed. You can use the following commands to install and switch between different versions:

```sh
sudo update-alternatives --install /usr/bin/gcc gcc /home/linuxbrew/.linuxbrew/bin/gcc-12 12
sudo update-alternatives --install /usr/bin/g++ g++ /home/linuxbrew/.linuxbrew/bin/g++-12 12
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
```

Then you can select the version to use with the following command:

```sh
sudo update-alternatives --config gcc
```

### 3 - Check for conflicting binaries in PATH

Make sure that there are no conflicting binaries in the PATH environment variable. For example, if `/home/linuxbrew/.linuxbrew/bin` precedes `/usr/bin/` in the PATH, it will override the `update-alternatives` settings.

### 4 - Add nvvm path to PATH

If you encounter the error `cicc not found`, make sure to add the nvvm path to PATH. For example, for CUDA version 12.1, the nvvm path is `/usr/local/cuda-12.1/nvvm/bin`.

### 5 - Add CUDA libraries to the project

If you encounter the error `Failed to extract nvcc implicit link line`, add the following code to the CMakeLists.txt file after enabling CUDA:

```cmake
include(CheckLanguage) # check_language() is provided by the CheckLanguage module
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
  find_package(CUDAToolkit)
  # replace `project` with the name of your target
  target_link_libraries(project CUDA::cudart)
  target_link_libraries(project CUDA::cuda_driver)
else()
  message(STATUS "No CUDA compiler found")
endif()
```

### 6 - Fix update alternatives

If the `update-alternatives` settings are broken, you can try to fix them with the following command:

`yes '' | update-alternatives --force --all`

### 7 - ..bin/crt/link.stub: No such file or directory

If you encounter this error, check whether the `$CUDA_HOME/bin/crt/link.stub` file is available.

Otherwise, create a symlink. For example, if the CUDA toolkit is installed with apt-get to the default path, you can create a symlink with the following command:

`ln -sf /usr/local/cuda-12.1/bin/crt/link.stub /usr/lib/nvidia-cuda-toolkit/bin/crt/link.stub`

Alternatively, you can replace the old CUDA root with a symlink to the new CUDA installation with the following command:

`ln -sf /usr/local/cuda-12.1/ /usr/lib/nvidia-cuda-toolkit/`

@@ -245,7 +245,7 @@ template <class CONFIG> class Field {
}


-private:
+// private:
typedef storage<TLC> ff_storage;
typedef storage<2*TLC> ff_wide_storage;

@@ -387,6 +387,14 @@ template <class CONFIG> class Field {
return CARRY_OUT ? carry : 0;
}

+static constexpr HOST_INLINE uint32_t sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) {
+uint32_t carry = 0;
+host_math::carry_chain<2 * TLC, false, true> chain;
+for (unsigned i = 0; i < num_limbs; i++)
+r[i] = chain.sub(x[i], y[i], carry);
+return carry;
+}

template <bool CARRY_OUT, typename T> static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T &xs, const T &ys, T &rs) {
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
@@ -411,7 +419,17 @@ template <class CONFIG> class Field {
}
}

+static DEVICE_INLINE void mul_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t start_i = 0) {
+#pragma unroll
+for (size_t i = start_i; i < n; i += 2) {
+acc[i] = ptx::mul_lo(a[i], bi);
+acc[i + 1] = ptx::mul_hi(a[i], bi);
+}
+}

static DEVICE_INLINE void cmad_n(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC) {
// multiply scalar by vector
// acc = acc + bi*A[::2]
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
#pragma unroll
@@ -421,7 +439,21 @@ template <class CONFIG> class Field {
}
}

+static DEVICE_INLINE void cmad_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx=0) {
+// multiply scalar by vector
+// acc = acc + bi*A[::2]
+acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]);
+acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]);
+#pragma unroll
+for (size_t i = a_start_idx + 2; i < n; i += 2) {
+acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
+acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
+}
+}

static DEVICE_INLINE void mad_row(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC) {
// odd = odd + bi*A
// even = even + bi*A
cmad_n(odd, a + 1, bi, n - 2);
odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0);
odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0);
@@ -429,6 +461,16 @@ template <class CONFIG> class Field {
odd[n - 1] = ptx::addc(odd[n - 1], 0);
}

+static DEVICE_INLINE void mad_row_msb(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) {
+// odd = odd + bi*A
+// even = even + bi*A
+cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1);
+odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0);
+odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0);
+cmad_n_msb(even, a, bi, n, a_start_idx);
+odd[n - 1] = ptx::addc(odd[n - 1], 0);
+}

static DEVICE_INLINE void multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
const uint32_t *a = as.limbs;
const uint32_t *b = bs.limbs;
@@ -450,13 +492,289 @@ template <class CONFIG> class Field {
even[i + 1] = ptx::addc(even[i + 1], 0);
}

static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t *r) {
r[0] = ptx::mul_lo(a, b);
r[1] = ptx::mul_hi(a, b);
}

static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
const uint32_t *a = as.limbs;
const uint32_t *b = bs.limbs;
uint32_t *r = rs.limbs;
uint32_t i, j;
uint32_t *even = rs.limbs;
__align__(8) uint32_t odd[2 * TLC];
for (uint32_t i = 0; i < 2 * TLC; i++)
{
even[i] = 0;
odd[i] = 0;
}
// first row special case, no carry in no carry out. split into two parts, even and odd.
for (i = 0; i < TLC - 1; i+=2 )
{
mult_no_carry(b[0], a[i], &even[i]);
mult_no_carry(b[0], a[i + 1], &odd[i]);
}

// doing two rows at one loop
for (i = 1; i < TLC - 1; i+=2)
{
// odd bi's
// multiply accumulate even part of new row with odd part prev row (needs a carry)
// j = 0, no carry in, only carry out
odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
// for loop carry in carry out
for (j = 2; j < TLC; j+=2) // 2, 4, 6
{
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
}
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry

// multiply accumulate odd part of new row with even part prev row (doesn't need a carry)
// j = 1, no carry in, only carry out
even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
// for loop carry in carry out
for (j = 3; j < TLC; j+=2)
{
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
}

// even bi's
// multiply accumulate even part of new row with even part of prev row // needs a carry
// j = 0, no carry in, only carry out
even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]);
even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]);
// for loop, carry in, carry out.
for (j = 2; j < TLC; j+=2)
{
even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
}
even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry

// multiply accumulate odd part of new row with odd part of prev row
// j = 1, no carry in, only carry out
odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]);
odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]);
// for loop, carry in, carry out.
for (j = 3; j < TLC; j+=2)
{
odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
}

}

odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
// for loop carry in carry out
for (j = 2; j < TLC; j+=2)
{
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
}
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry

// multiply accumulate odd part of new row with even part prev row
// j = 1, no carry in, only carry out
even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
// for loop carry in carry out
for (j = 3; j < TLC; j+=2)
{
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
}

// add even and odd parts
even[1] = ptx::add_cc(even[1], odd[0]);
for (i = 1; i < 2 * TLC - 2; i++)
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
even[i + 1] = ptx::addc(even[i + 1], 0);
}
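
To make the even/odd bookkeeping in `ingo_multiply_raw_device` easier to follow, here is a minimal host-side sketch (not part of the diff) of the plain schoolbook product it computes. Limbs are little-endian 32-bit words, matching `convert_to_boost_mp` in the tests; the device code accumulates the same column sums but splits them into `even` and `odd` arrays so it can use the carry-chained PTX `mad` instructions.

```cpp
#include <cstddef>
#include <cstdint>

// Reference schoolbook multiplication: r (2*n limbs) = a (n limbs) * b (n limbs).
void schoolbook_mul(const uint32_t* a, const uint32_t* b, uint32_t* r, size_t n) {
  for (size_t k = 0; k < 2 * n; k++) r[k] = 0;
  for (size_t i = 0; i < n; i++) {
    uint64_t carry = 0;
    for (size_t j = 0; j < n; j++) {
      // One partial product a[j]*b[i] lands in column i + j; the carry moves up.
      uint64_t t = (uint64_t)a[j] * b[i] + r[i + j] + carry;
      r[i + j] = (uint32_t)t;
      carry = t >> 32;
    }
    r[i + n] = (uint32_t)carry;
  }
}
```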

static DEVICE_INLINE void ingo_msb_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
const uint32_t *a = as.limbs;
const uint32_t *b = bs.limbs;
uint32_t *r = rs.limbs;
uint32_t i, j;
uint32_t *even = rs.limbs;
__align__(8) uint32_t odd[2 * TLC];
for (uint32_t i = 0; i < 2 * TLC; i++)
{
even[i] = 0;
odd[i] = 0;
}
// only last element from first row.
mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]);

// doing two rows at one loop
#pragma unroll
for (i = 1; i < TLC - 1; i+=2)
{
const uint32_t first_active_j = TLC - 1 - i;
const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2));
const uint32_t first_active_j_even = first_active_j + first_active_j % 2;
// odd bi's
// multiply accumulate even part of new row with odd part prev row (needs a carry)
// j = 0, no carry in, only carry out
odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]);
odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]);
// for loop carry in carry out
#pragma unroll
for (j = first_active_j_even + 2; j < TLC; j+=2)
{
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
}
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry

// multiply accumulate odd part of new row with even part prev row (doesn't need a carry)
// j = 1, no carry in, only carry out
even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]);
even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]);
// for loop carry in carry out
#pragma unroll
for (j = first_active_j_odd + 2; j < TLC; j+=2)
{
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
}

// even bi's
uint32_t const first_active_j1 = TLC - 1 - (i + 1);
uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2));
uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2;
// multiply accumulate even part of new row with even part of prev row // needs a carry
// j = 0, no carry in, only carry out
even[first_active_j_even1 + i + 1] = ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]);
even[first_active_j_even1 + i + 2] = ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]);
// for loop, carry in, carry out.
#pragma unroll
for (j = first_active_j_even1 + 2; j < TLC; j+=2)
{
even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
}
even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry

// multiply accumulate odd part of new row with odd part of prev row
// j = 1, no carry in, only carry out
odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]);
odd[first_active_j_odd1 + i + 1] = ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]);
// for loop, carry in, carry out.
#pragma unroll
for (j = first_active_j_odd1 + 2; j < TLC; j+=2)
{
odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
}

}

// last round, i = TLC - 1
odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
// for loop carry in carry out
#pragma unroll
for (j = 2; j < TLC; j+=2)
{
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
}
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry

// multiply accumulate odd part of new row with even part prev row
// j = 1, no carry in, only carry out
even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
// for loop carry in carry out
#pragma unroll
for (j = 3; j < TLC; j+=2)
{
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
}

// add even and odd parts
even[1] = ptx::add_cc(even[1], odd[0]);
#pragma unroll
for (i = 1; i < 2 * TLC - 2; i++)
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
even[i + 1] = ptx::addc(even[i + 1], 0);
}

static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
// r = a * b is correct for the first TLC + 1 digits (not computing from TLC + 1 to 2*TLC - 2).
const uint32_t *a = as.limbs;
const uint32_t *b = bs.limbs;
uint32_t *even = rs.limbs;
__align__(8) uint32_t odd[2 * TLC - 2];
mul_n(even, a, b[0]);
mul_n(odd, a + 1, b[0]);
mad_row(&even[2], &odd[0], a, b[1]);
size_t i;
#pragma unroll
for (i = 2; i < TLC - 1; i += 2) {
mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2);
mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2);
}

// merge |even| and |odd|
even[1] = ptx::add_cc(even[1], odd[0]);
for (i = 1; i < TLC + 1; i++)
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
even[i + 1] = ptx::addc(even[i + 1], 0);
}

static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
const uint32_t *a = as.limbs;
const uint32_t *b = bs.limbs;
uint32_t *even = rs.limbs;
__align__(8) uint32_t odd[2 * TLC - 2];
for (int i=0; i<2*TLC - 1; i++)
{
even[i] = 0;
odd[i] = 0;
}
uint32_t min_indexes_sum = TLC - 1;
// only diagonal
mul_n_msb(even, a, b[0], TLC, min_indexes_sum);
mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1);
mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1);
size_t i;
#pragma unroll
for (i = 2; i < TLC - 1; i += 2) {
mad_row(&odd[i], &even[i], a, b[i]);
mad_row(&even[i + 2], &odd[i], a, b[i + 1]);
}
// merge |even| and |odd|
even[1] = ptx::add_cc(even[1], odd[0]);
for (i = 1; i < 2 * TLC - 2; i++)
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
even[i + 1] = ptx::addc(even[i + 1], 0);
}

static HOST_INLINE void multiply_raw_host(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
const uint32_t *a = as.limbs;
const uint32_t *b = bs.limbs;
uint32_t *r = rs.limbs;
for (unsigned i = 0; i < TLC; i++) {
uint32_t carry = 0;
for (unsigned j = 0; j < TLC; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[TLC + i] = carry;
}
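
The `*_msb_*` variants above skip every partial product `a[j] * b[i]` whose column index `i + j` falls below `TLC - 1` (`min_indexes_sum`), which is why the tests later only require the top limbs to agree with the full product up to a small error. A host-side sketch of that idea (not the diff's code), using the same little-endian limb convention:

```cpp
#include <cstddef>
#include <cstdint>

// "MSB-only" schoolbook product: drop all partial products with i + j < n - 1.
// The top n limbs of r match the full product up to a small under-estimate,
// since each dropped column could only have fed a bounded carry into limb n - 1.
void schoolbook_mul_msb(const uint32_t* a, const uint32_t* b, uint32_t* r, size_t n) {
  for (size_t k = 0; k < 2 * n; k++) r[k] = 0;
  for (size_t i = 0; i < n; i++) {
    uint64_t carry = 0;
    for (size_t j = n - 1 - i; j < n; j++) {  // only columns with i + j >= n - 1
      uint64_t t = (uint64_t)a[j] * b[i] + r[i + j] + carry;
      r[i + j] = (uint32_t)t;
      carry = t >> 32;
    }
    r[i + n] = (uint32_t)carry;
  }
}
```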

@@ -470,6 +788,22 @@ template <class CONFIG> class Field {
#endif
}

+static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
+#ifdef __CUDA_ARCH__
+return multiply_lsb_raw_device(as, bs, rs);
+#else
+return multiply_raw_host(as, bs, rs);
+#endif
+}
+
+static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
+#ifdef __CUDA_ARCH__
+return ingo_msb_multiply_raw_device(as, bs, rs);
+#else
+return multiply_raw_host(as, bs, rs);
+#endif
+}
+
public:
ff_storage limbs_storage;

@@ -541,20 +875,47 @@ template <class CONFIG> class Field {
return rs;
}

+static constexpr DEVICE_INLINE uint32_t sub_limbs_partial_device(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) {
+r[0] = ptx::sub_cc(x[0], y[0]);
+#pragma unroll
+for (unsigned i = 1; i < num_limbs; i++)
+r[i] = ptx::subc_cc(x[i], y[i]);
+return ptx::subc(0, 0);
+}
+
+static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs_partial(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) {
+#ifdef __CUDA_ARCH__
+return sub_limbs_partial_device(x, y, r, num_limbs);
+#else
+return sub_limbs_partial_host(x, y, r, num_limbs);
+#endif
+}
+
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) {
-wide xy = mul_wide(xs, ys);
-Field xy_hi = xy.get_higher_with_slack();
+//printf("operator* called \n");
+wide xy = mul_wide(xs, ys); // full mult
+Field xy_hi = xy.get_higher_with_slack(); // xy << slack_bits
wide l = {};
-multiply_raw(xy_hi.limbs_storage, get_m(), l.limbs_storage);
+multiply_raw_msb(xy_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult
Field l_hi = l.get_higher_with_slack();
wide lp = {};
-multiply_raw(l_hi.limbs_storage, get_modulus(), lp.limbs_storage);
-wide r_wide = xy - lp;
+multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult
+wide r_wide = xy - lp;
wide r_wide_reduced = {};
-uint32_t reduced = sub_limbs<true>(r_wide.limbs_storage, modulus_wide(), r_wide_reduced.limbs_storage);
-r_wide = reduced ? r_wide : r_wide_reduced;
+// uint32_t reduced = sub_limbs<true>(r_wide.limbs_storage, modulus_wide(), r_wide_reduced.limbs_storage);
+// r_wide = reduced ? r_wide : r_wide_reduced;
+for (unsigned i = 0; i < TLC + 1; i++)
+{
+uint32_t carry = sub_limbs_partial(r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1);
+if (carry == 0) // continue to reduce
+r_wide = r_wide_reduced;
+else // done
+break;
+}
+
+// number of wrap around is bounded by TLC + 1 times.
Field r = r_wide.get_lower();
-return reduce<1>(r);
+return (r);
}

friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys) {
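
The rewritten `operator*` follows a Barrett-style flow: a full `mul_wide`, an MSB-only multiply of the high part by the precomputed constant from `get_m()` to estimate the quotient, an LSB-only multiply of that estimate by the modulus, a subtraction, and then at most `TLC + 1` corrective subtractions of the modulus (the "wrap around" loop). A toy single-word sketch of the same flow, not the diff's code, with `__int128` standing in for the wide limb arithmetic:

```cpp
#include <cstdint>

// Toy Barrett-style modular multiplication for a modulus p that fits in 32 bits.
// m = floor(2^64 / p) plays the role of get_m(); the high/low split mirrors
// multiply_raw_msb / multiply_raw_lsb in the diff.
uint64_t barrett_mul(uint32_t a, uint32_t b, uint32_t p, uint64_t m) {
  unsigned __int128 xy = (unsigned __int128)a * b;                 // full product
  uint64_t xy_hi = (uint64_t)(xy >> 32);                           // high part
  uint64_t q = (uint64_t)(((unsigned __int128)xy_hi * m) >> 64);   // quotient estimate (MSB part)
  uint64_t r = (uint64_t)xy - q * p;                               // subtract q*p (LSB part)
  while (r >= p) r -= p;                                           // bounded number of corrections
  return r;
}
```

Whether this matches the exact slack-bit accounting of the real `Field` code is not spelled out in the diff; the sketch is only meant to map the sequence of calls onto a familiar algorithm.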

@@ -1,8 +1,9 @@
#include <cuda_runtime.h>
#include <gtest/gtest.h>

#include "test_kernels.cuh"

#include <iostream>
+#include <boost/multiprecision/cpp_int.hpp>
+namespace mp = boost::multiprecision;

template <class T>
int device_populate_random(T* d_elements, unsigned n) {
@@ -20,9 +21,19 @@ int device_set(T* d_elements, T el, unsigned n) {
return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
}

+mp::int1024_t convert_to_boost_mp(uint32_t *a, uint32_t length)
+{
+mp::int1024_t res = 0;
+for (uint32_t i = 0; i < length; i++)
+{
+res += (mp::int1024_t)(a[i]) << 32 * i;
+}
+return res;
+}
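
`convert_to_boost_mp` interprets the limb array as little-endian 32-bit words (limb `i` is weighted by `2^(32*i)`). A hypothetical sanity check in the same googletest style, not part of the diff, makes the ordering explicit:

```cpp
// Hypothetical test: the limb array {0x0, 0x1} must convert to 2^32.
TEST(ConvertToBoostMp, LimbOrderIsLittleEndian) {
  uint32_t limbs[2] = {0x0, 0x1};
  ASSERT_EQ(convert_to_boost_mp(limbs, 2), mp::int1024_t(1) << 32);
}
```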

class PrimitivesTest : public ::testing::Test {
protected:
-static const unsigned n = 1 << 5;
+static const unsigned n = 1 << 10;

proj *points1{};
proj *points2{};
@@ -36,6 +47,8 @@ protected:
proj *res_points2{};
scalar_field *res_scalars1{};
scalar_field *res_scalars2{};
+scalar_field::wide *res_scalars_wide{};
+scalar_field::wide *res_scalars_wide_full{};

PrimitivesTest() {
assert(!cudaDeviceReset());
@@ -51,6 +64,9 @@ protected:
assert(!cudaMallocManaged(&res_points2, n * sizeof(proj)));
assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field)));
assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_field)));
+assert(!cudaMallocManaged(&res_scalars_wide, n * sizeof(scalar_field::wide)));
+assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field::wide)));
}

~PrimitivesTest() override {
@@ -66,6 +82,10 @@ protected:
cudaFree(res_points2);
cudaFree(res_scalars1);
cudaFree(res_scalars2);
+cudaFree(res_scalars_wide);
+cudaFree(res_scalars_wide_full);
cudaDeviceReset();
}

@@ -82,6 +102,9 @@ protected:
ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(proj)), cudaSuccess);
ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field)), cudaSuccess);
ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field)), cudaSuccess);
+ASSERT_EQ(cudaMemset(res_scalars_wide, 0, n * sizeof(scalar_field::wide)), cudaSuccess);
+ASSERT_EQ(cudaMemset(res_scalars_wide_full, 0, n * sizeof(scalar_field::wide)), cudaSuccess);
}
};

@@ -255,6 +278,189 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) {
ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]);
}

TEST_F(PrimitivesTest, MP_LSB_MULT) {
// LSB multiply, check correctness of the first TLC + 1 digits of the result.
ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
std::cout << "first GPU lsb mult output = 0x";
for (int i=0; i<2*scalar_field::TLC; i++)
{
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i];
}
std::cout << std::endl;

ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
std::cout << "first GPU full mult output = 0x";
for (int i=0; i<2*scalar_field::TLC; i++)
{
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i];
}
std::cout << std::endl;
for (int j = 0; j < n; j++)
{
for (int i=0; i<scalar_field::TLC + 1; i++)
{
ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
}
}
}

TEST_F(PrimitivesTest, MP_MSB_MULT) {
// MSB multiply, print the top limbs of the multiplication next to the full product for comparison.
ASSERT_EQ(mp_msb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
std::cout << "first GPU msb mult output = 0x";
for (int i=2*scalar_field::TLC - 1; i >=0 ; i--)
{
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
}
std::cout << std::endl;

ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
std::cout << "first GPU full mult output = 0x";
for (int i=2*scalar_field::TLC - 1; i >=0 ; i--)
{
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
}

std::cout << std::endl;

for (int i=0; i < 2*scalar_field::TLC - 1; i++)
{
if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
std::cout << "matched word idx = " << i << std::endl;
}

}

TEST_F(PrimitivesTest, INGO_MP_MULT) {
// Full multiply, compare against the reference full product limb by limb.
ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
std::cout << "INGO = 0x";
for (int i=0; i < 2*scalar_field::TLC ; i++)
{
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
}
std::cout << std::endl;

ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
std::cout << "ZKSYNC = 0x";
for (int i=0; i < 2*scalar_field::TLC ; i++)
{
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
}

std::cout << std::endl;

for (int i=0; i < 2*scalar_field::TLC - 1; i++)
{
if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
std::cout << "matched word idx = " << i << std::endl;
}
for (int j=0; j<n; j++)
{
for (int i=0; i < 2*scalar_field::TLC - 1; i++)
{
ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
}
}

}

TEST_F(PrimitivesTest, INGO_MP_MSB_MULT) {
// MSB multiply, take the top limbs of the multiplication and assert that the error is bounded.
ASSERT_EQ(ingo_mp_msb_mult(scalars1, scalars2, res_scalars_wide, n), cudaSuccess);
std::cout << "INGO MSB = 0x";
for (int i=2*scalar_field::TLC - 1; i >= 0 ; i--)
{
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
}
std::cout << std::endl;

ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
std::cout << "ZKSYNC = 0x";
for (int i=2*scalar_field::TLC - 1; i >= 0 ; i--)
{
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
}

std::cout << std::endl;

// for (int i=scalar_field::TLC; i < 2*scalar_field::TLC - 1; i++)
// {
// ASSERT_EQ(in_bound, true);
// }
// for (int j=0; j<n; j++)
// {
// for (int i=0; i < 2*scalar_field::TLC - 1; i++)
// {
// ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
// }
// }
// mp testing
mp::int1024_t scalar_1_mp = 0;
mp::int1024_t scalar_2_mp = 0;
mp::int1024_t res_mp = 0;
mp::int1024_t res_gpu = 0;
uint32_t num_limbs = scalar_field::TLC;

for (int j=0; j<n; j++)
{
uint32_t* scalar1_limbs = scalars1[j].limbs_storage.limbs;
uint32_t* scalar2_limbs = scalars2[j].limbs_storage.limbs;
scalar_1_mp = convert_to_boost_mp(scalar1_limbs, num_limbs);
scalar_2_mp = convert_to_boost_mp(scalar2_limbs, num_limbs);
res_mp = scalar_1_mp * scalar_2_mp;
res_mp = res_mp >> (num_limbs * 32);
res_gpu = convert_to_boost_mp(&(res_scalars_wide[j]).limbs_storage.limbs[num_limbs], num_limbs);
std::cout << "res mp = " << res_mp << std::endl;
std::cout << "res gpu = " << res_gpu << std::endl;
std::cout << "error = " << res_mp - res_gpu << std::endl;
bool upper_bound = res_gpu <= res_mp;
bool lower_bound = res_gpu > (res_mp - num_limbs);
bool in_bound = upper_bound && lower_bound;

ASSERT_EQ(in_bound, true);
}
}

TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) {
std::cout << " taking num limbs " << std::endl;
uint32_t num_limbs = scalar_field::TLC;
std::cout << " calling gpu... = " << std::endl;
ASSERT_EQ(ingo_mp_mod_mult(scalars1, scalars2, res_scalars1, n), cudaSuccess);
std::cout << " gpu call done " << std::endl;
// mp testing
mp::int1024_t scalar_1_mp = 0;
mp::int1024_t scalar_2_mp = 0;
mp::int1024_t res_mp = 0;
mp::int1024_t res_gpu = 0;
mp::int1024_t p = convert_to_boost_mp(scalar_field::get_modulus().limbs, num_limbs);
std::cout << " p = " << p << std::endl;

for (int j=0; j<n; j++)
{
uint32_t* scalar1_limbs = scalars1[j].limbs_storage.limbs;
uint32_t* scalar2_limbs = scalars2[j].limbs_storage.limbs;
scalar_1_mp = convert_to_boost_mp(scalar1_limbs, num_limbs);
scalar_2_mp = convert_to_boost_mp(scalar2_limbs, num_limbs);
// std::cout << " s1 = " << scalar_1_mp << std::endl;
// std::cout << " s2 = " << scalar_2_mp << std::endl;
res_mp = (scalar_1_mp * scalar_2_mp) % p;
res_gpu = convert_to_boost_mp((res_scalars1[j]).limbs_storage.limbs, num_limbs);
std::cout << "res mp = " << res_mp << std::endl;
std::cout << "res gpu = " << res_gpu << std::endl;
std::cout << "error = " << res_mp - res_gpu << std::endl;
ASSERT_EQ(res_gpu, res_mp);
}
}

int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);

@@ -105,3 +105,87 @@ int point_vec_to_affine(const proj *x, affine *result, const unsigned count) {
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}

__global__ void mp_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
scalar_field::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
}

int mp_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
{
mp_mult_kernel<<<1, 32>>>(x, y, result);
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}

__global__ void mp_lsb_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
scalar_field::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
}

int mp_lsb_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
{
mp_lsb_mult_kernel<<<1, 32>>>(x, y, result);
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}

__global__ void mp_msb_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
scalar_field::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
}

int mp_msb_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
{
mp_msb_mult_kernel<<<1, 1>>>(x, y, result);
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}

__global__ void ingo_mp_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
scalar_field::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
}

int ingo_mp_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
{
ingo_mp_mult_kernel<<<1, 32>>>(x, y, result);
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}

__global__ void ingo_mp_msb_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
scalar_field::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
}

int ingo_mp_msb_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result, const unsigned n)
{
ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result);
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}

__global__ void ingo_mp_mod_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field *result) {
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
result[gid] = x[gid] * y[gid];
}

int ingo_mp_mod_mult(const scalar_field *x, scalar_field *y, scalar_field *result, const unsigned n)
{
ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result);
int error = cudaGetLastError();
return error ? error : cudaDeviceSynchronize();
}