Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-01-13 01:17:57 -05:00)

Compare commits: mod_mult_o...feat/batch (2 commits)

Commits: ae5c0681ad, 58a2492d60
.gitignore (vendored, 1 line changed)

@@ -11,4 +11,3 @@
**/__pycache__/
**/.DS_Store
**/Cargo.lock
**/icicle/build/
Cargo.toml (22 lines changed)

@@ -3,39 +3,25 @@ name = "icicle-utils"
version = "0.1.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "An implementation of the Ingonyama CUDA Library"
description = "An implementation of the Ingonyama Cuda Library"
homepage = "https://www.ingonyama.com"
repository = "https://github.com/ingonyama-zk/icicle"

[[bench]]
name = "ntt"
path = "benches/ntt.rs"
harness = false

[[bench]]
name = "msm"
path = "benches/msm.rs"
harness = false

[dependencies]
hex = "*"
hex="*"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = { version = "0.3.0", optional = true }

rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"

rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
rand="*" #TODO: sort dependencies that are not required by the release

[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }

[dev-dependencies]
"criterion" = "0.4.0"
rand="*"

[features]
default = ["bls12_381"]
README.md (15 lines changed)

@@ -41,7 +41,7 @@ nvcc -o build/<ENTER_DIR_NAME> ./icicle/appUtils/ntt/ntt.cu ./icicle/appUtils/ms

### Testing the CUDA code

We are using the [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder:
We are using the [googletest] library for testing. To build and run the test suite for finite field and elliptic curve arithmetic, run from the `icicle` folder:

```sh
mkdir -p build

@@ -76,15 +76,6 @@ cargo build --release

You'll find a release-ready library at `target/release/libicicle_utils.rlib`.

To benchmark and test the functionality available in Rust, run:

```
cargo bench
cargo test -- --test-threads=1
```

The flag `--test-threads=1` is needed because currently some tests might interfere with one another inside the GPU.

### Example Usage

An example of using the Rust bindings library can be found in our [fast-danksharding implementation][FDI]

@@ -93,10 +84,6 @@ An example of using the Rust bindings library can be found in our [fast-dankshar

Join our [Discord Server](https://discord.gg/Y4SkbDf2Ff) and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.

### Hall of Fame

- [Robik](https://github.com/robik75), for his ongoing support and mentorship

## License

ICICLE is distributed under the terms of the MIT License.
@@ -1,32 +0,0 @@
extern crate criterion;

use criterion::{criterion_group, criterion_main, Criterion};

use icicle_utils::{set_up_scalars, generate_random_points, commit_batch, get_rng};
use rustacuda::prelude::*;


const LOG_MSM_SIZES: [usize; 1] = [12];
const BATCH_SIZES: [usize; 2] = [128, 256];

fn bench_msm(c: &mut Criterion) {
    for log_msm_size in LOG_MSM_SIZES {
        for batch_size in BATCH_SIZES {
            let msm_size = 1 << log_msm_size;
            let (scalars, _, _) = set_up_scalars(msm_size, 0, false);
            let batch_scalars = vec![scalars; batch_size].concat();
            let mut d_scalars = DeviceBuffer::from_slice(&batch_scalars[..]).unwrap();
            let points = generate_random_points(msm_size, get_rng(None));
            let batch_points = vec![points; batch_size].concat();
            let mut d_points = DeviceBuffer::from_slice(&batch_points[..]).unwrap();

            c.bench_function(
                &format!("MSM of size 2^{} in batch {}", log_msm_size, batch_size),
                |b| b.iter(|| commit_batch(&mut d_points, &mut d_scalars, batch_size))
            );
        }
    }
}

criterion_group!(msm_benches, bench_msm);
criterion_main!(msm_benches);
@@ -1,40 +0,0 @@
extern crate criterion;

use criterion::{criterion_group, criterion_main, Criterion};

use icicle_utils::{interpolate_scalars_batch, interpolate_points_batch, set_up_scalars, set_up_points};


const LOG_NTT_SIZES: [usize; 1] = [15];
const BATCH_SIZES: [usize; 2] = [8, 16];

fn bench_point_ntt(c: &mut Criterion) {
    for log_ntt_size in LOG_NTT_SIZES {
        for batch_size in BATCH_SIZES {
            let ntt_size = 1 << log_ntt_size;
            let (_, mut d_evals, mut d_domain) = set_up_points(ntt_size * batch_size, log_ntt_size, true);

            c.bench_function(
                &format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
                |b| b.iter(|| interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size))
            );
        }
    }
}

fn bench_scalar_ntt(c: &mut Criterion) {
    for log_ntt_size in LOG_NTT_SIZES {
        for batch_size in BATCH_SIZES {
            let ntt_size = 1 << log_ntt_size;
            let (_, mut d_evals, mut d_domain) = set_up_scalars(ntt_size * batch_size, log_ntt_size, true);

            c.bench_function(
                &format!("Scalar NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
                |b| b.iter(|| interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size))
            );
        }
    }
}

criterion_group!(ntt_benches, bench_point_ntt, bench_scalar_ntt);
criterion_main!(ntt_benches);
build.rs (8 lines changed)

@@ -7,7 +7,8 @@ fn main() {
    println!("cargo:rerun-if-env-changed=CXXFLAGS");
    println!("cargo:rerun-if-changed=./icicle");

    let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
    let arch_type = env::var("ARCH_TYPE")
        .unwrap_or(String::from("native"));

    let mut arch = String::from("-arch=");
    arch.push_str(&arch_type);

@@ -19,10 +20,11 @@ fn main() {
    nvcc.cuda(true);
    nvcc.debug(false);
    nvcc.flag(&arch);
    nvcc.flag("--std=c++17");
    nvcc.files([
        "./icicle/appUtils/vector_manipulation/ve_mod_mult.cu",
        "./icicle/appUtils/ntt/lde.cu",
        "./icicle/appUtils/ntt/ntt.cu",
        "./icicle/appUtils/msm/msm.cu",
        "./icicle/appUtils/vector_manipulation/ve_mod_mult.cu",
        "./icicle/primitives/projective.cu",
    ]);
    nvcc.compile("ingo_icicle"); //TODO: extension??
@@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.16)
cmake_minimum_required(VERSION 3.14)
project(icicle)

# GoogleTest requires at least C++14
set(CMAKE_CXX_STANDARD 17)

@@ -8,9 +9,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
# add the target cuda architectures
# each additional architecture increases the compilation time and output file size
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES native) # requires CMake 3.24+; on earlier versions it is ignored and the target is not passed
  set(CMAKE_CUDA_ARCHITECTURES 80)
endif ()
project(icicle LANGUAGES CUDA CXX)
project(bellman-cuda LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")

@@ -22,10 +23,6 @@ FetchContent_Declare(
  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings

# boost lib
include_directories("/home/miner/include/boost_1_80_0")

set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
@@ -1,81 +0,0 @@
# Tests

```sh
mkdir -p build; cmake -S . -B build; cmake --build build; cd build && ctest; cd ..
```

## Prerequisites on Ubuntu

Before proceeding, make sure the following software is installed:

1. CMake, at least version 3.16, which can be downloaded from [cmake.org](https://cmake.org/files/).
   It is recommended to have the latest version installed.
2. [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu) version 12.0 or newer.
3. GCC - version 9 or newer recommended.

## Troubleshooting

In case you encounter problems during the build, please follow the points below to troubleshoot:

### 1 - Check CMake log files

If there are issues with the CMake configuration, please check the logs which are located in the `./build/CMakeFiles` directory. Depending on the version of CMake, the log file may have a different name. For example, for CMake version 3.20, one of the log files is called `CMakeConfigureLog.yaml`.

### 2 - Check for conflicting GCC versions

Make sure that there are no conflicting versions of GCC installed. You can use the following commands to install and switch between different versions:

```sh
sudo update-alternatives --install /usr/bin/gcc gcc /home/linuxbrew/.linuxbrew/bin/gcc-12 12
sudo update-alternatives --install /usr/bin/g++ g++ /home/linuxbrew/.linuxbrew/bin/g++-12 12
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
```

Then you can select the active version with the following command:

```sh
sudo update-alternatives --config gcc
```

### 3 - Check for conflicting binaries in PATH

Make sure that there are no conflicting binaries in the PATH environment variable. For example, if `/home/linuxbrew/.linuxbrew/bin` precedes `/usr/bin/` in the PATH, it will override the `update-alternatives` settings.

### 4 - Add nvvm path to PATH

If you encounter the error `cicc not found`, make sure to add the nvvm path to PATH. For example, for CUDA version 12.1, the nvvm path is `/usr/local/cuda-12.1/nvvm/bin`.

### 5 - Add CUDA libraries to the project

If you encounter the error `Failed to extract nvcc implicit link line`, add the following code to the CMakeLists.txt file after enabling CUDA:

```c
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
  find_package(CUDAToolkit)
  target_link_libraries(project CUDA::cudart)
  target_link_libraries(project CUDA::cuda_driver)
else()
  message(STATUS "No CUDA compiler found")
endif()
```

### 6 - Fix update alternatives

If the `update-alternatives` settings are broken, you can try to fix them with the following command:

`yes '' | update-alternatives --force --all`

### 7 - ..bin/crt/link.stub: No such file or directory

If you encounter this error, check if the `$CUDA_HOME/bin/crt/link.stub` file is available.

Otherwise, create a symlink. For example, if the CUDA toolkit is installed with apt-get to the default path, you can create a symlink with the following command:

`ln -sf /usr/local/cuda-12.1/bin/crt/link.stub /usr/lib/nvidia-cuda-toolkit/bin/crt/link.stub`

Alternatively, you can replace the old CUDA root with a symlink to the new CUDA installation with the following command:

`ln -sf /usr/local/cuda-12.1/ /usr/lib/nvidia-cuda-toolkit/`
@@ -147,21 +147,16 @@ __global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsig
|
||||
|
||||
//this function computes msm using the bucket method
|
||||
template <typename S, typename P, typename A>
|
||||
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device) {
|
||||
void bucket_method_msm(unsigned bitsize, unsigned c, S *h_scalars, A *h_points, unsigned size, P*h_final_result){
|
||||
|
||||
S *d_scalars;
|
||||
A *d_points;
|
||||
if (!on_device) {
|
||||
//copy scalars and point to gpu
|
||||
cudaMalloc(&d_scalars, sizeof(S) * size);
|
||||
cudaMalloc(&d_points, sizeof(A) * size);
|
||||
cudaMemcpy(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice);
|
||||
}
|
||||
else {
|
||||
d_scalars = scalars;
|
||||
d_points = points;
|
||||
}
|
||||
//copy scalars and point to gpu
|
||||
S *scalars;
|
||||
A *points;
|
||||
|
||||
cudaMalloc(&scalars, sizeof(S) * size);
|
||||
cudaMalloc(&points, sizeof(A) * size);
|
||||
cudaMemcpy(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice);
|
||||
|
||||
P *buckets;
|
||||
//compute number of bucket modules and number of buckets in each module
|
||||
@@ -173,7 +168,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
|
||||
nof_bms++;
|
||||
}
|
||||
unsigned nof_buckets = nof_bms<<c;
|
||||
cudaMalloc(&buckets, sizeof(P) * nof_buckets);
|
||||
cudaMalloc(&buckets, sizeof(P) * nof_buckets);
|
||||
|
||||
// launch the bucket initialization kernel with maximum threads
|
||||
unsigned NUM_THREADS = 1 << 10;
|
||||
@@ -188,8 +183,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
|
||||
//split scalars into digits
|
||||
NUM_THREADS = 1 << 10;
|
||||
NUM_BLOCKS = (size * (nof_bms+1) + NUM_THREADS - 1) / NUM_THREADS;
|
||||
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size,
|
||||
nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later
|
||||
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(bucket_indices + size, point_indices + size, scalars, size, msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later
|
||||
|
||||
//sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket
|
||||
unsigned *sort_indices_temp_storage{};
|
||||
@@ -234,8 +228,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
|
||||
//launch the accumulation kernel with maximum threads
|
||||
NUM_THREADS = 1 << 8;
|
||||
NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
||||
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices,
|
||||
d_points, nof_buckets, 1, c+bm_bitsize);
|
||||
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices, points, nof_buckets, 1, c+bm_bitsize);
|
||||
|
||||
#ifdef SSM_SUM
|
||||
//sum each bucket
|
||||
@@ -260,25 +253,19 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
|
||||
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, final_results, nof_bms, c);
|
||||
#endif
|
||||
|
||||
P* d_final_result;
|
||||
if (!on_device)
|
||||
cudaMalloc(&d_final_result, sizeof(P));
|
||||
|
||||
P* final_result;
|
||||
cudaMalloc(&final_result, sizeof(P));
|
||||
//launch the double and add kernel, a single thread
|
||||
final_accumulation_kernel<P, S><<<1,1>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c);
|
||||
final_accumulation_kernel<P, S><<<1,1>>>(final_results, final_result,1, nof_bms, c);
|
||||
|
||||
//copy final result to host
|
||||
cudaDeviceSynchronize();
|
||||
if (!on_device)
|
||||
cudaMemcpy(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost);
|
||||
|
||||
//free memory
|
||||
if (!on_device) {
|
||||
cudaFree(d_points);
|
||||
cudaFree(d_scalars);
|
||||
cudaFree(d_final_result);
|
||||
}
|
||||
cudaFree(buckets);
|
||||
cudaFree(points);
|
||||
cudaFree(scalars);
|
||||
cudaFree(bucket_indices);
|
||||
cudaFree(point_indices);
|
||||
cudaFree(single_bucket_indices);
|
||||
@@ -286,26 +273,23 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
|
||||
cudaFree(nof_buckets_to_compute);
|
||||
cudaFree(bucket_offsets);
|
||||
cudaFree(final_results);
|
||||
cudaFree(final_result);
|
||||
|
||||
}
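For readers following the kernel sequence above, here is a minimal host-side sketch of the same bucket (Pippenger) method, with plain `uint64_t` addition standing in for elliptic-curve point addition and scalars restricted to 64 bits (pass `bitsize <= 64`); all names are illustrative and the layout is simplified to one sequential bucket array per window rather than parallel bucket modules:

```cpp
#include <cstdint>
#include <vector>

// CPU sketch of the bucket (Pippenger) MSM: uint64_t addition stands in for
// elliptic-curve point addition, so the result is sum(scalars[i] * points[i]) mod 2^64.
uint64_t bucket_msm_sketch(const std::vector<uint64_t>& scalars,
                           const std::vector<uint64_t>& points,
                           unsigned bitsize, unsigned c) {
    unsigned nof_bms = (bitsize + c - 1) / c;     // number of bucket modules (c-bit windows)
    uint64_t nof_buckets = uint64_t(1) << c;      // buckets per module
    uint64_t result = 0;
    // walk the windows from most to least significant so the running result can be
    // "doubled" c times (here: shifted by c bits) between windows
    for (int bm = int(nof_bms) - 1; bm >= 0; bm--) {
        std::vector<uint64_t> buckets(nof_buckets, 0);
        for (size_t i = 0; i < scalars.size(); i++) {
            // digit extraction: the c-bit digit of scalar i for this window
            uint64_t digit = (scalars[i] >> (unsigned(bm) * c)) & (nof_buckets - 1);
            if (digit != 0)
                buckets[digit] += points[i];      // accumulate the point into its bucket
        }
        // weighted bucket reduction: window_sum = sum_k k * buckets[k] via a running suffix sum
        uint64_t running = 0, window_sum = 0;
        for (uint64_t k = nof_buckets - 1; k >= 1; k--) {
            running += buckets[k];
            window_sum += running;
        }
        result = (result << c) + window_sum;      // combine this window with the previous ones
    }
    return result;
}
```

Each stage corresponds to a kernel above: the digit extraction to `split_scalars_kernel`, the per-bucket sums to `accumulate_buckets_kernel`, the weighted bucket reduction to `big_triangle_sum_kernel`, and the window combination to `final_accumulation_kernel`.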
|
||||
|
||||
//this function computes msm using the bucket method
|
||||
template <typename S, typename P, typename A>
|
||||
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device){
|
||||
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *h_scalars, A *h_points, unsigned batch_size, unsigned msm_size, P*h_final_results){
|
||||
|
||||
//copy scalars and point to gpu
|
||||
S *scalars;
|
||||
A *points;
|
||||
|
||||
unsigned total_size = batch_size * msm_size;
|
||||
S *d_scalars;
|
||||
A *d_points;
|
||||
if (!on_device) {
|
||||
//copy scalars and point to gpu
|
||||
cudaMalloc(&d_scalars, sizeof(S) * total_size);
|
||||
cudaMalloc(&d_points, sizeof(A) * total_size);
|
||||
cudaMemcpy(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice);
|
||||
}
|
||||
else {
|
||||
d_scalars = scalars;
|
||||
d_points = points;
|
||||
}
|
||||
cudaMalloc(&scalars, sizeof(S) * total_size);
|
||||
cudaMalloc(&points, sizeof(A) * total_size);
|
||||
cudaMemcpy(scalars, h_scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(points, h_points, sizeof(A) * total_size, cudaMemcpyHostToDevice);
|
||||
|
||||
P *buckets;
|
||||
//compute number of bucket modules and number of buckets in each module
|
||||
@@ -332,8 +316,7 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin
|
||||
//split scalars into digits
|
||||
NUM_THREADS = 1 << 8;
|
||||
NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size,
|
||||
msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later
|
||||
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(bucket_indices + msm_size, point_indices + msm_size, scalars, total_size, msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later
|
||||
|
||||
//sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket
|
||||
unsigned *sorted_bucket_indices;
|
||||
@@ -385,8 +368,7 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin
|
||||
//launch the accumulation kernel with maximum threads
|
||||
NUM_THREADS = 1 << 8;
|
||||
NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
||||
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices,
|
||||
d_points, nof_buckets, batch_size, c+bm_bitsize);
|
||||
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, points, nof_buckets, batch_size, c+bm_bitsize);
|
||||
|
||||
#ifdef SSM_SUM
|
||||
//sum each bucket
|
||||
@@ -411,27 +393,21 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin
|
||||
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bm_sums, nof_bms*batch_size, c);
|
||||
#endif
|
||||
|
||||
P* d_final_results;
|
||||
if (!on_device)
|
||||
cudaMalloc(&d_final_results, sizeof(P)*batch_size);
|
||||
|
||||
P* final_results;
|
||||
cudaMalloc(&final_results, sizeof(P)*batch_size);
|
||||
//launch the double and add kernel, a single thread for each msm
|
||||
NUM_THREADS = 1<<8;
|
||||
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
final_accumulation_kernel<P, S><<<NUM_BLOCKS,NUM_THREADS>>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c);
|
||||
final_accumulation_kernel<P, S><<<NUM_BLOCKS,NUM_THREADS>>>(bm_sums, final_results, batch_size, nof_bms, c);
|
||||
|
||||
//copy final result to host
|
||||
cudaDeviceSynchronize();
|
||||
if (!on_device)
|
||||
cudaMemcpy(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(h_final_results, final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost);
|
||||
|
||||
//free memory
|
||||
if (!on_device) {
|
||||
cudaFree(d_points);
|
||||
cudaFree(d_scalars);
|
||||
cudaFree(d_final_results);
|
||||
}
|
||||
cudaFree(buckets);
|
||||
cudaFree(points);
|
||||
cudaFree(scalars);
|
||||
cudaFree(bucket_indices);
|
||||
cudaFree(point_indices);
|
||||
cudaFree(sorted_bucket_indices);
|
||||
@@ -441,6 +417,7 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin
|
||||
cudaFree(total_nof_buckets_to_compute);
|
||||
cudaFree(bucket_offsets);
|
||||
cudaFree(bm_sums);
|
||||
cudaFree(final_results);
|
||||
|
||||
}
|
||||
|
||||
@@ -456,7 +433,7 @@ __global__ void to_proj_kernel(A* affine_points, P* proj_points, unsigned N){
|
||||
|
||||
//the function computes msm using ssm
|
||||
template <typename S, typename P, typename A>
|
||||
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device){ //works up to 2^8
|
||||
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result){ //works up to 2^8
|
||||
|
||||
S *scalars;
|
||||
A *a_points;
|
||||
@@ -521,28 +498,24 @@ void reference_msm(S* scalars, A* a_points, unsigned size){
|
||||
|
||||
}
|
||||
|
||||
unsigned get_optimal_c(const unsigned size) {
|
||||
return 10;
|
||||
}
|
||||
|
||||
//this function is used to compute msms of size larger than 256
|
||||
template <typename S, typename P, typename A>
|
||||
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device){
|
||||
unsigned c = get_optimal_c(size);
|
||||
void large_msm(S* scalars, A* points, unsigned size, P* result){
|
||||
unsigned c = 10;
|
||||
// unsigned c = 6;
|
||||
// unsigned bitsize = 32;
|
||||
unsigned bitsize = 255;
|
||||
bucket_method_msm(bitsize, c, scalars, points, size, result, on_device);
|
||||
bucket_method_msm(bitsize, c, scalars, points, size, result);
|
||||
}
|
||||
|
||||
// this function is used to compute a batch of msms of size larger than 256
|
||||
template <typename S, typename P, typename A>
|
||||
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device){
|
||||
unsigned c = get_optimal_c(msm_size);
|
||||
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result){
|
||||
unsigned c = 10;
|
||||
// unsigned c = 6;
|
||||
// unsigned bitsize = 32;
|
||||
unsigned bitsize = 255;
|
||||
batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device);
|
||||
batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
@@ -552,10 +525,10 @@ int msm_cuda(projective_t *out, affine_t points[],
|
||||
try
|
||||
{
|
||||
if (count>256){
|
||||
large_msm<scalar_t, projective_t, affine_t>(scalars, points, count, out, false);
|
||||
large_msm<scalar_t, projective_t, affine_t>(scalars, points, count, out);
|
||||
}
|
||||
else{
|
||||
short_msm<scalar_t, projective_t, affine_t>(scalars, points, count, out, false);
|
||||
short_msm<scalar_t, projective_t, affine_t>(scalars, points, count, out);
|
||||
}
|
||||
|
||||
return CUDA_SUCCESS;
|
||||
@@ -572,7 +545,7 @@ extern "C" int msm_batch_cuda(projective_t* out, affine_t points[],
|
||||
{
|
||||
try
|
||||
{
|
||||
batched_large_msm<scalar_t, projective_t, affine_t>(scalars, points, batch_size, msm_size, out, false);
|
||||
batched_large_msm<scalar_t, projective_t, affine_t>(scalars, points, batch_size, msm_size, out);
|
||||
|
||||
return CUDA_SUCCESS;
|
||||
}
|
||||
@@ -582,50 +555,3 @@ extern "C" int msm_batch_cuda(projective_t* out, affine_t points[],
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Commit to a polynomial using the MSM.
|
||||
* Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
|
||||
* @param d_out Output point to write the result to.
|
||||
* @param d_scalars Scalars for the MSM. Must be on device.
|
||||
* @param d_points Points for the MSM. Must be on device.
|
||||
* @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
|
||||
*/
|
||||
extern "C"
|
||||
int commit_cuda(projective_t* d_out, scalar_t* d_scalars, affine_t* d_points, size_t count, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
large_msm(d_scalars, d_points, count, d_out, true);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Commit to a batch of polynomials using the MSM.
|
||||
* Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points.
|
||||
* @param d_out Output point to write the results to.
|
||||
* @param d_scalars Scalars for the MSMs of all polynomials. Must be on device.
|
||||
* @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM.
|
||||
* @param count Length of `d_points` array, `d_scalars` has length `count` * `batch_size`.
|
||||
* @param batch_size Size of the batch.
|
||||
*/
|
||||
extern "C"
|
||||
int commit_batch_cuda(projective_t* d_out, scalar_t* d_scalars, affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
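To make the device-pointer contract of `commit_cuda`/`commit_batch_cuda` concrete, here is a hedged host-side sketch of a caller. `commit_batch_example` is a hypothetical helper, the include path follows msm.cuh, error checking is omitted, and it follows the doc comment above in assuming the same `count` bases are reused for every MSM in the batch:

```cpp
#include <cuda_runtime.h>
#include "../../curves/curve_config.cuh" // as in msm.cuh; provides scalar_t, affine_t, projective_t

// Hypothetical caller: stage host data on the GPU, run one MSM per polynomial
// against a shared set of bases, and copy the batch of results back.
int commit_batch_example(projective_t* h_out, const scalar_t* h_scalars,
                         const affine_t* h_points, size_t count, size_t batch_size) {
  scalar_t* d_scalars;
  affine_t* d_points;
  projective_t* d_out;
  cudaMalloc(&d_scalars, sizeof(scalar_t) * count * batch_size); // count scalars per MSM
  cudaMalloc(&d_points, sizeof(affine_t) * count);               // bases shared by all MSMs
  cudaMalloc(&d_out, sizeof(projective_t) * batch_size);         // one result per MSM
  cudaMemcpy(d_scalars, h_scalars, sizeof(scalar_t) * count * batch_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_points, h_points, sizeof(affine_t) * count, cudaMemcpyHostToDevice);

  int ret = commit_batch_cuda(d_out, d_scalars, d_points, count, batch_size, 0);

  cudaMemcpy(h_out, d_out, sizeof(projective_t) * batch_size, cudaMemcpyDeviceToHost);
  cudaFree(d_scalars);
  cudaFree(d_points);
  cudaFree(d_out);
  return ret;
}
```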
|
||||
|
||||
@@ -7,16 +7,10 @@
#include "../../curves/curve_config.cuh"

template <typename S, typename P, typename A>
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device);
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result);

template <typename S, typename P, typename A>
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device);
void large_msm(S* scalars, A* points, unsigned size, P* result);

template <typename S, typename P, typename A>
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device);

template <typename S, typename P, typename A>
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device);

template <typename S, typename P, typename A>
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device);
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result);
@@ -1,463 +0,0 @@
|
||||
#include <cuda.h>
|
||||
#include "ntt.cuh"
|
||||
#include "../vector_manipulation/ve_mod_mult.cuh"
|
||||
#include "lde.cuh"
|
||||
|
||||
|
||||
/**
|
||||
* Interpolate a batch of polynomials from their evaluations on the same subgroup.
|
||||
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
|
||||
* @param d_out The variable to write coefficients of the resulting polynomials into (the coefficients are in bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
|
||||
* @param d_evaluations Input array of evaluations of all polynomials of type E (elements).
|
||||
* @param d_domain Domain on which the polynomials are evaluated. Must be a subgroup.
|
||||
* @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial.
|
||||
* @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`.
|
||||
*/
|
||||
template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size) {
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
cudaMemcpy(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice);
|
||||
|
||||
int NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
|
||||
int NUM_BLOCKS = batch_size * max(int((n / 2) / NUM_THREADS), 1);
|
||||
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
|
||||
{
|
||||
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
|
||||
}
|
||||
|
||||
NUM_BLOCKS = (n * batch_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>> (d_out, n * batch_size, scalar_t::inv_log_size(logn));
|
||||
return 0;
|
||||
}
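For orientation, what `interpolate_batch` computes per polynomial is a standard inverse NTT: assuming `d_domain` holds the inverse twiddle factors (powers of $\omega^{-1}$, as produced by `build_domain_cuda` with `inverse = true`), the coefficients recovered from the evaluations $e_i$ are

$$ a_j = n^{-1} \sum_{i=0}^{n-1} e_i \, \omega^{-ij}, \qquad j = 0, \dots, n-1, $$

which is why the code runs the NTT butterfly stages over `d_domain` and then normalizes by `scalar_t::inv_log_size(logn)` $= n^{-1}$.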
|
||||
|
||||
/**
|
||||
* Interpolate a polynomial from its evaluations on a subgroup.
|
||||
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
|
||||
* @param d_out The variable to write coefficients of the resulting polynomial into (the coefficients are in bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
|
||||
* @param d_evaluations Input array of evaluations that have type E (elements).
|
||||
* @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup.
|
||||
* @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length).
|
||||
*/
|
||||
template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n) {
|
||||
return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1);
|
||||
}
|
||||
|
||||
template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
|
||||
int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if (tid < n) {
|
||||
arr[tid] = val;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluate a batch of polynomials on the same coset.
|
||||
* @param d_out The evaluations of the polynomials on coset `u` * `d_domain`.
|
||||
* @param d_coefficients Input array of coefficients of all polynomials of type E (elements) to be evaluated in-place on a coset.
|
||||
* @param d_domain Domain on which the polynomials are evaluated (see `coset` flag). Must be a subgroup.
|
||||
* @param domain_size Length of `d_domain` array, on which the polynomial is computed.
|
||||
* @param n The number of coefficients, which might be different from `domain_size`.
|
||||
* @param batch_size The size of the batch; the length of `d_coefficients` is `n` * `batch_size`.
|
||||
* @param coset The flag that indicates whether to evaluate on a coset. If false, evaluate on a subgroup `d_domain`.
|
||||
* @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
|
||||
*/
|
||||
template <typename E, typename S>
|
||||
int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers) {
|
||||
uint32_t logn = uint32_t(log(domain_size) / log(2));
|
||||
if (domain_size > n) {
|
||||
// allocate and initialize an array of stream handles to parallelize data copying across batches
|
||||
cudaStream_t *memcpy_streams = (cudaStream_t *) malloc(batch_size * sizeof(cudaStream_t));
|
||||
for (int i = 0; i < batch_size; i++)
|
||||
{
|
||||
cudaStreamCreate(&(memcpy_streams[i]));
|
||||
|
||||
cudaMemcpyAsync(&d_out[i * domain_size], &d_coefficients[i * n], n * sizeof(E), cudaMemcpyDeviceToDevice, memcpy_streams[i]);
|
||||
int NUM_THREADS = MAX_THREADS_BATCH;
|
||||
int NUM_BLOCKS = (domain_size - n + NUM_THREADS - 1) / NUM_THREADS;
|
||||
fill_array <E> <<<NUM_BLOCKS, NUM_THREADS, 0, memcpy_streams[i]>>> (&d_out[i * domain_size + n], E::zero(), domain_size - n);
|
||||
|
||||
cudaStreamSynchronize(memcpy_streams[i]);
|
||||
cudaStreamDestroy(memcpy_streams[i]);
|
||||
}
|
||||
} else
|
||||
cudaMemcpy(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice);
|
||||
|
||||
if (coset)
|
||||
batch_vector_mult(coset_powers, d_out, domain_size, batch_size);
|
||||
|
||||
int NUM_THREADS = min(domain_size / 2, MAX_THREADS_BATCH);
|
||||
int chunks = max(int((domain_size / 2) / NUM_THREADS), 1);
|
||||
int NUM_BLOCKS = batch_size * chunks;
|
||||
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
|
||||
{
|
||||
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
|
||||
}
|
||||
return 0;
|
||||
}
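A note on the coset path above: scaling the coefficients by the coset powers before the NTT is exactly evaluation on the shifted domain, since for a coset generator $u$

$$ p(u\,\omega^i) = \sum_{j} a_j \, u^j \, \omega^{ij}, $$

i.e. an ordinary NTT applied to the scaled coefficients $\{a_j u^j\}$; this is what the `batch_vector_mult(coset_powers, d_out, ...)` call provides before the butterfly stages.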
|
||||
|
||||
/**
|
||||
* Evaluate a polynomial on a coset.
|
||||
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs, so the order of outputs is bit-reversed.
|
||||
* @param d_out The evaluations of the polynomial on coset `u` * `d_domain`.
|
||||
* @param d_coefficients Input array of coefficients of a polynomial of type E (elements).
|
||||
* @param d_domain Domain on which the polynomial is evaluated (see `coset` flag). Must be a subgroup.
|
||||
* @param domain_size Length of `d_domain` array, on which the polynomial is computed.
|
||||
* @param n The number of coefficients, which might be different from `domain_size`.
|
||||
* @param coset The flag that indicates whether to evaluate on a coset. If false, evaluate on a subgroup `d_domain`.
|
||||
* @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
|
||||
*/
|
||||
template <typename E, typename S>
|
||||
int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers) {
|
||||
return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers);
|
||||
}
|
||||
|
||||
int interpolate_scalars(scalar_t* d_out, scalar_t* d_evaluations, scalar_t* d_domain, unsigned n) {
|
||||
return interpolate(d_out, d_evaluations, d_domain, n);
|
||||
}
|
||||
|
||||
int interpolate_scalars_batch(scalar_t* d_out, scalar_t* d_evaluations, scalar_t* d_domain, unsigned n, unsigned batch_size) {
|
||||
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
|
||||
}
|
||||
|
||||
int interpolate_points(projective_t* d_out, projective_t* d_evaluations, scalar_t* d_domain, unsigned n) {
|
||||
return interpolate(d_out, d_evaluations, d_domain, n);
|
||||
}
|
||||
|
||||
int interpolate_points_batch(projective_t* d_out, projective_t* d_evaluations, scalar_t* d_domain, unsigned n, unsigned batch_size) {
|
||||
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
|
||||
}
|
||||
|
||||
int evaluate_scalars(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size, unsigned n) {
|
||||
scalar_t* _null = nullptr;
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
|
||||
}
|
||||
|
||||
int evaluate_scalars_batch(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size) {
|
||||
scalar_t* _null = nullptr;
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
|
||||
}
|
||||
|
||||
int evaluate_points(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain, unsigned domain_size, unsigned n) {
|
||||
scalar_t* _null = nullptr;
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
|
||||
}
|
||||
|
||||
int evaluate_points_batch(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain,
|
||||
unsigned domain_size, unsigned n, unsigned batch_size) {
|
||||
scalar_t* _null = nullptr;
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
|
||||
}
|
||||
|
||||
int evaluate_scalars_on_coset(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain,
|
||||
unsigned domain_size, unsigned n, scalar_t* coset_powers) {
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
|
||||
}
|
||||
|
||||
int evaluate_scalars_on_coset_batch(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, scalar_t* coset_powers) {
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
|
||||
}
|
||||
|
||||
int evaluate_points_on_coset(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain,
|
||||
unsigned domain_size, unsigned n, scalar_t* coset_powers) {
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
|
||||
}
|
||||
|
||||
int evaluate_points_on_coset_batch(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, scalar_t* coset_powers) {
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
|
||||
}
|
||||
|
||||
extern "C" scalar_t* build_domain_cuda(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (inverse) {
|
||||
return fill_twiddle_factors_array(domain_size, scalar_t::omega_inv(logn));
|
||||
} else {
|
||||
return fill_twiddle_factors_array(domain_size, scalar_t::omega(logn));
|
||||
}
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int ntt_cuda(scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return ntt_end2end(arr, n, inverse); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int ecntt_cuda(projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return ecntt_end2end(arr, n, inverse); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int ntt_batch_cuda(scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return ntt_end2end_batch(arr, arr_size, batch_size, inverse); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int ecntt_batch_cuda(projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return ecntt_end2end_batch(arr, arr_size, batch_size, inverse); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int interpolate_scalars_cuda(scalar_t* d_out, scalar_t *d_evaluations, scalar_t *d_domain, unsigned n, unsigned device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate_scalars(d_out, d_evaluations, d_domain, n); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int interpolate_scalars_batch_cuda(scalar_t* d_out, scalar_t* d_evaluations, scalar_t* d_domain, unsigned n,
|
||||
unsigned batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate_scalars_batch(d_out, d_evaluations, d_domain, n, batch_size); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int interpolate_points_cuda(projective_t* d_out, projective_t *d_evaluations, scalar_t *d_domain, unsigned n, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate_points(d_out, d_evaluations, d_domain, n); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int interpolate_points_batch_cuda(projective_t* d_out, projective_t* d_evaluations, scalar_t* d_domain,
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate_points_batch(d_out, d_evaluations, d_domain, n, batch_size); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_cuda(scalar_t* d_out, scalar_t *d_coefficients, scalar_t *d_domain,
|
||||
unsigned domain_size, unsigned n, unsigned device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_scalars(d_out, d_coefficients, d_domain, domain_size, n); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_batch_cuda(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_scalars_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_cuda(projective_t* d_out, projective_t *d_coefficients, scalar_t *d_domain,
|
||||
unsigned domain_size, unsigned n, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_points(d_out, d_coefficients, d_domain, domain_size, n); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_batch_cuda(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_points_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_on_coset_cuda(scalar_t* d_out, scalar_t *d_coefficients, scalar_t *d_domain, unsigned domain_size,
|
||||
unsigned n, scalar_t *coset_powers, unsigned device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_scalars_on_coset(d_out, d_coefficients, d_domain, domain_size, n, coset_powers); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_on_coset_batch_cuda(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, scalar_t *coset_powers, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_scalars_on_coset_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, coset_powers); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_on_coset_cuda(projective_t* d_out, projective_t *d_coefficients, scalar_t *d_domain, unsigned domain_size,
|
||||
unsigned n, scalar_t *coset_powers, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_points_on_coset(d_out, d_coefficients, d_domain, domain_size, n, coset_powers); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_on_coset_batch_cuda(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, scalar_t *coset_powers, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_points_on_coset_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, coset_powers); // TODO: pass device_id
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_scalars_cuda(scalar_t* arr, int n, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order(arr, n, logn);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_scalars_batch_cuda(scalar_t* arr, int n, int batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order_batch(arr, n, logn, batch_size);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_points_cuda(projective_t* arr, int n, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order(arr, n, logn);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_points_batch_cuda(projective_t* arr, int n, int batch_size, size_t device_id = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order_batch(arr, n, logn, batch_size);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
@@ -1,31 +0,0 @@
#pragma once


int interpolate_scalars(scalar_t* d_out, scalar_t* d_evaluations, scalar_t* d_domain, unsigned n);

int interpolate_scalars_batch(scalar_t* d_out, scalar_t* d_evaluations, scalar_t* d_domain, unsigned n, unsigned batch_size);

int interpolate_points(projective_t* d_out, projective_t* d_evaluations, scalar_t* d_domain, unsigned n);

int interpolate_points_batch(projective_t* d_out, projective_t* d_evaluations, scalar_t* d_domain, unsigned n, unsigned batch_size);

int evaluate_scalars(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size, unsigned n);

int evaluate_scalars_batch(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size);

int evaluate_points(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain, unsigned domain_size, unsigned n);

int evaluate_points_batch(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain,
                          unsigned domain_size, unsigned n, unsigned batch_size);

int evaluate_scalars_on_coset(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain,
                              unsigned domain_size, unsigned n, scalar_t* coset_powers);

int evaluate_scalars_on_coset_batch(scalar_t* d_out, scalar_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
                                    unsigned n, unsigned batch_size, scalar_t* coset_powers);

int evaluate_points_on_coset(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain,
                             unsigned domain_size, unsigned n, scalar_t* coset_powers);

int evaluate_points_on_coset_batch(projective_t* d_out, projective_t* d_coefficients, scalar_t* d_domain, unsigned domain_size,
                                   unsigned n, unsigned batch_size, scalar_t* coset_powers);
@@ -10,6 +10,7 @@ extern "C" int ntt_cuda(scalar_t *arr, uint32_t n, bool inverse, size_t device_i
  catch (const std::runtime_error &ex)
  {
    printf("error %s", ex.what());

    return -1;
  }
}
@@ -25,6 +25,9 @@ const uint32_t MAX_THREADS_BATCH = 256;
|
||||
* @param omega multiplying factor.
|
||||
*/
|
||||
__global__ void twiddle_factors_kernel(scalar_t * d_twiddles, uint32_t n_twiddles, scalar_t omega) {
|
||||
for (uint32_t i = 0; i < n_twiddles; i++) {
|
||||
d_twiddles[i] = scalar_t::zero();
|
||||
}
|
||||
d_twiddles[0] = scalar_t::one();
|
||||
for (uint32_t i = 0; i < n_twiddles - 1; i++) {
|
||||
d_twiddles[i + 1] = omega * d_twiddles[i];
|
||||
@@ -46,27 +49,28 @@ scalar_t * fill_twiddle_factors_array(uint32_t n_twiddles, scalar_t omega) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the bit reversed order of a number.
|
||||
* Returens the bit reversed order of a number.
|
||||
* for example: on inputs num = 6 (110 in binary) and logn = 3
|
||||
* the function should return 3 (011 in binary.)
|
||||
* @param num some number with bit representation of size logn.
|
||||
* @param logn length of bit representation of `num`.
|
||||
* @return bit reveresed order or `num`.
|
||||
* @param logn length of bit representation of num.
|
||||
* @return bit reveresed order or num.
|
||||
*/
|
||||
__device__ __host__ uint32_t reverseBits(uint32_t num, uint32_t logn) {
|
||||
unsigned int reverse_num = 0;
|
||||
for (int i = 0; i < logn; i++) {
|
||||
int i;
|
||||
for (i = 0; i < logn; i++) {
|
||||
if ((num & (1 << i))) reverse_num |= 1 << ((logn - 1) - i);
|
||||
}
|
||||
return reverse_num;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the bit reversal ordering of the input array.
|
||||
* Returens the bit reversal ordering of the input array.
|
||||
* for example: on input ([a[0],a[1],a[2],a[3]], 4, 2) it returns
|
||||
* [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 switch places).
|
||||
* @param arr array of some object of type T of size which is a power of 2.
|
||||
* @param n length of `arr`.
|
||||
* @param n length of arr.
|
||||
* @param logn log(n).
|
||||
* @return A new array which is the bit reversed version of input array.
|
||||
*/
|
||||
@@ -79,55 +83,14 @@ template < typename T > T * template_reverse_order(T * arr, uint32_t n, uint32_t
|
||||
return arrReversed;
|
||||
}
|
||||
|
||||
template < typename T > __global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) {
|
||||
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if (threadId < n * batch_size) {
|
||||
int idx = threadId % n;
|
||||
int batch_idx = threadId / n;
|
||||
int idx_reversed = __brev(idx) >> (32 - logn);
|
||||
arr_reversed[batch_idx * n + idx_reversed] = arr[batch_idx * n + idx];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Bit-reverses a batch of input arrays in-place inside GPU.
|
||||
* for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
|
||||
* [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 switch places).
|
||||
* @param arr batch of arrays of some object of type T. Should be on GPU.
|
||||
* @param n length of `arr`.
|
||||
* @param logn log(n).
|
||||
* @param batch_size the size of the batch.
|
||||
*/
|
||||
template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size) {
|
||||
T* arr_reversed;
|
||||
cudaMalloc(&arr_reversed, n * batch_size * sizeof(T));
|
||||
int number_of_threads = MAX_THREADS_BATCH;
|
||||
int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
|
||||
reverse_order_kernel <<<number_of_blocks, number_of_threads>>> (arr, arr_reversed, n, logn, batch_size);
|
||||
cudaMemcpy(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice);
|
||||
cudaFree(arr_reversed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Bit-reverses an input array in-place inside GPU.
|
||||
* for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
|
||||
* [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 switch places).
|
||||
* @param arr array of some object of type T of size which is a power of 2. Should be on GPU.
|
||||
* @param n length of `arr`.
|
||||
* @param logn log(n).
|
||||
*/
|
||||
template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn) {
|
||||
reverse_order_batch(arr, n, logn, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey butterfly kernel.
|
||||
* Cooley-Tuckey butterfly kernel.
|
||||
* @param arr array of objects of type E (elements).
|
||||
* @param twiddles array of twiddle factors of type S (scalars).
|
||||
* @param n size of arr.
|
||||
* @param n_twiddles size of omegas.
|
||||
* @param m "pair distance" - indicate distance of butterflies inputs.
|
||||
* @param i Cooley-Tukey FFT stage number.
|
||||
* @param i Cooley-TUckey FFT stage number.
|
||||
* @param max_thread_num maximal number of threads in stage.
|
||||
*/
|
||||
template < typename E, typename S > __global__ void template_butterfly_kernel(E * arr, S * twiddles, uint32_t n, uint32_t n_twiddles, uint32_t m, uint32_t i, uint32_t max_thread_num) {
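For orientation (a sketch only, since the kernel body is elided by this hunk): a radix-2 Cooley-Tukey butterfly of this shape combines a pair of elements $a$ and $b$ that sit a "pair distance" $m$ apart, using a twiddle factor $w$, as

$$ a' = a + w\,b, \qquad b' = a - w\,b. $$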
|
||||
@@ -143,26 +106,27 @@ template < typename E, typename S > __global__ void template_butterfly_kernel(E
|
||||
}
|
||||
|
||||
/**
|
||||
* Multiply the elements of an input array by a scalar in-place.
|
||||
* @param arr input array.
|
||||
* @param n size of arr.
|
||||
* @param n_inv scalar of type S (scalar).
|
||||
* Set the elements of arr to be the elements of res multiplied by some scalar.
|
||||
* @param arr input array.
|
||||
* @param res result array.
|
||||
* @param n size of arr.
|
||||
* @param n_inv scalar of type S (scalar).
|
||||
*/
|
||||
template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) {
|
||||
template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, E * res, uint32_t n, S scalar) {
|
||||
int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if (tid < n) {
|
||||
arr[tid] = scalar * arr[tid];
|
||||
res[tid] = scalar * arr[tid];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey NTT.
|
||||
* Cooley-Tuckey NTT.
|
||||
* NOTE! this function assumes that d_arr and d_twiddles are located in the device memory.
|
||||
* @param d_arr input array of type E (elements) allocated on the device memory.
|
||||
* @param d_arr input array of type E (elements) allocated on the device memory.
|
||||
* @param n length of d_arr.
|
||||
* @param logn log(n).
|
||||
* @param d_twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
|
||||
* @param n_twiddles length of d_twiddles.
|
||||
* @param logn log(n).
|
||||
* @param d_twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
|
||||
* @param n_twiddles length of d_twiddles.
|
||||
*/
|
||||
template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr, uint32_t n, uint32_t logn, S * d_twiddles, uint32_t n_twiddles) {
|
||||
uint32_t m = 2;
|
||||
@@ -178,7 +142,7 @@ template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey NTT.
|
||||
* Cooley-Tuckey NTT.
|
||||
* NOTE! this function assumes that d_twiddles are located in the device memory.
|
||||
* @param arr input array of type E (elements).
|
||||
* @param n length of d_arr.
|
||||
@@ -194,10 +158,10 @@ template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_
|
||||
cudaMalloc( & d_arrReversed, size_E);
|
||||
cudaMemcpy(d_arrReversed, arrReversed, size_E, cudaMemcpyHostToDevice);
|
||||
template_ntt_on_device_memory < E, S > (d_arrReversed, n, logn, d_twiddles, n_twiddles);
|
||||
if (inverse) {
|
||||
if (inverse == true) {
|
||||
int NUM_THREADS = MAX_NUM_THREADS;
|
||||
int NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
|
||||
template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arrReversed, n, S::inv_log_size(logn));
|
||||
template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arrReversed, d_arrReversed, n, S::inv_log_size(logn));
|
||||
}
|
||||
cudaMemcpy(arrReversed, d_arrReversed, size_E, cudaMemcpyDeviceToHost);
|
||||
cudaFree(d_arrReversed);
|
||||
@@ -205,7 +169,7 @@ template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey Elliptic Curve NTT.
|
||||
* Cooley-Tuckey Elliptic Curve NTT.
|
||||
* NOTE! this function assumes that d_twiddles are located in the device memory.
|
||||
* @param arr input array of type projective_t.
|
||||
* @param n length of d_arr.
|
||||
@@ -218,7 +182,7 @@ projective_t * ecntt(projective_t * arr, uint32_t n, scalar_t * d_twiddles, uint
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey (scalar) NTT.
|
||||
* Cooley-Tuckey (scalar) NTT.
|
||||
* NOTE! this function assumes that d_twiddles are located in the device memory.
|
||||
* @param arr input array of type scalar_t.
|
||||
* @param n length of d_arr.
|
||||
@@ -232,7 +196,7 @@ scalar_t * ntt(scalar_t * arr, uint32_t n, scalar_t * d_twiddles, uint32_t n_twi
|
||||
|
||||
|
||||
/**
|
||||
* Cooley-Tukey (scalar) NTT.
|
||||
* Cooley-Tuckey (scalar) NTT.
|
||||
* @param arr input array of type scalar_t.
|
||||
* @param n length of d_arr.
|
||||
* @param inverse indicate if the result array should be normalized by n^(-1).
|
||||
@@ -243,7 +207,7 @@ scalar_t * ntt(scalar_t * arr, uint32_t n, scalar_t * d_twiddles, uint32_t n_twi
|
||||
scalar_t * d_twiddles;
|
||||
if (inverse){
|
||||
d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega_inv(logn));
|
||||
} else {
|
||||
} else{
|
||||
d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega(logn));
|
||||
}
|
||||
scalar_t * result = ntt_template < scalar_t, scalar_t > (arr, n, d_twiddles, n_twiddles, inverse);
|
||||
@@ -256,7 +220,7 @@ scalar_t * ntt(scalar_t * arr, uint32_t n, scalar_t * d_twiddles, uint32_t n_twi
|
||||
|
||||
|
||||
/**
|
||||
* Cooley-Tukey (scalar) NTT.
|
||||
* Cooley-Tuckey (scalar) NTT.
|
||||
* @param arr input array of type projective_t.
|
||||
* @param n length of d_arr.
|
||||
* @param inverse indicate if the result array should be normalized by n^(-1).
|
||||
@@ -302,7 +266,7 @@ scalar_t * ntt(scalar_t * arr, uint32_t n, scalar_t * d_twiddles, uint32_t n_twi
|
||||
|
||||
|
||||
/**
|
||||
* Cooley-Tukey butterfly kernel.
|
||||
* Cooley-Tuckey butterfly kernel.
|
||||
* @param arr array of objects of type E (elements).
|
||||
* @param twiddles array of twiddle factors of type S (scalars).
|
||||
* @param n size of arr.
|
||||
@@ -321,7 +285,7 @@ template < typename E, typename S > __device__ __host__ void butterfly(E * arrRe
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey NTT.
|
||||
* Cooley-Tuckey NTT.
|
||||
* NOTE! this function assumes that d_twiddles are located in the device memory.
|
||||
* @param arr input array of type E (elements).
|
||||
* @param n length of d_arr.
|
||||
@@ -331,7 +295,7 @@ template < typename E, typename S > __device__ __host__ void butterfly(E * arrRe
|
||||
* @param s log2(n) loop index.
|
||||
*/
|
||||
template <typename E, typename S>
|
||||
__global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, bool rev)
|
||||
__global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s)
|
||||
{
|
||||
int task = blockIdx.x;
|
||||
int chunks = n / (blockDim.x * 2);
|
||||
@@ -358,18 +322,16 @@ __global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_
|
||||
|
||||
uint32_t offset = (task / chunks) * n;
|
||||
E u = arr[offset + i + j];
|
||||
E v = rev ? arr[offset + k] : twiddles[j * n_twiddles_div] * arr[offset + k];
|
||||
E v = twiddles[j * n_twiddles_div] * arr[offset + k];
|
||||
arr[offset + i + j] = u + v;
|
||||
arr[offset + k] = u - v;
|
||||
if (rev)
|
||||
arr[offset + k] = twiddles[j * n_twiddles_div] * arr[offset + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Cooley-Tukey NTT.
|
||||
* Cooley-Tuckey NTT.
|
||||
* NOTE! this function assumes that d_twiddles are located in the device memory.
|
||||
* @param arr input array of type E (elements).
|
||||
* @param n length of arr.
|
||||
@@ -390,7 +352,7 @@ __global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, u
|
||||
|
||||
//TODO: batch ntt and ecntt can be unified into batch_template
|
||||
/**
|
||||
* Cooley-Tukey (scalar) NTT.
|
||||
* Cooley-Tuckey (scalar) NTT.
|
||||
 * This is a batched version - meaning it assumes that the input array
 * consists of N arrays of size n. The function performs an n-size NTT on each small array.
|
||||
* @param arr input array of type scalar_t.
|
||||
@@ -423,14 +385,14 @@ __global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, u
|
||||
|
||||
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
|
||||
{
|
||||
ntt_template_kernel<scalar_t, scalar_t><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false);
|
||||
ntt_template_kernel<scalar_t, scalar_t><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s);
|
||||
}
|
||||
|
||||
if (inverse == true)
|
||||
{
|
||||
NUM_THREADS = MAX_NUM_THREADS;
|
||||
NUM_BLOCKS = (arr_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
template_normalize_kernel < scalar_t, scalar_t > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, arr_size, scalar_t::inv_log_size(logn));
|
||||
template_normalize_kernel < scalar_t, scalar_t > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, d_arr, arr_size, scalar_t::inv_log_size(logn));
|
||||
}
|
||||
cudaMemcpy(arr, d_arr, size_E, cudaMemcpyDeviceToHost);
|
||||
cudaFree(d_arr);
|
||||
@@ -439,7 +401,7 @@ __global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, u
|
||||
}
|
||||
|
||||
/**
|
||||
* Cooley-Tukey (scalar) NTT.
|
||||
* Cooley-Tuckey (scalar) NTT.
|
||||
 * This is a batched version - meaning it assumes that the input array
 * consists of N arrays of size n. The function performs an n-size NTT on each small array.
|
||||
* @param arr input array of type scalar_t.
|
||||
@@ -472,13 +434,14 @@ __global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, u
|
||||
|
||||
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
|
||||
{
|
||||
ntt_template_kernel<projective_t, scalar_t><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false);
|
||||
ntt_template_kernel<projective_t, scalar_t><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s);
|
||||
}
|
||||
if (inverse == true)
|
||||
{
|
||||
NUM_THREADS = MAX_NUM_THREADS;
|
||||
NUM_BLOCKS = (arr_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
template_normalize_kernel < projective_t, scalar_t > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, arr_size, scalar_t::inv_log_size(logn));
|
||||
// TODO: no correctnes when swapped NUM_THREADS, NUM_BLOCKS
|
||||
template_normalize_kernel < projective_t, scalar_t > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, d_arr, arr_size, scalar_t::inv_log_size(logn));
|
||||
}
|
||||
cudaMemcpy(arr, d_arr, size_E, cudaMemcpyDeviceToHost);
|
||||
cudaFree(d_arr);
|
||||
|
||||
@@ -7,6 +7,169 @@
|
||||
#include "ve_mod_mult.cuh"
|
||||
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
|
||||
// TODO: headers for prototypes and .c .cpp .cu files for implementations
|
||||
template <typename E, typename S>
|
||||
__global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n_elments)
|
||||
{
|
||||
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_elments)
|
||||
{
|
||||
result[tid] = scalar_vec[tid] * element_vec[tid];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename E, typename S>
|
||||
int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments) // TODO: in place so no need for third result vector
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
|
||||
int threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
|
||||
// Allocate memory on the device for the input vectors, the output vector, and the modulus
|
||||
S *d_vec_a;
|
||||
E *d_vec_b, *d_result;
|
||||
cudaMalloc(&d_vec_a, n_elments * sizeof(S));
|
||||
cudaMalloc(&d_vec_b, n_elments * sizeof(E));
|
||||
cudaMalloc(&d_result, n_elments * sizeof(E));
|
||||
|
||||
// Copy the input vectors and the modulus from the host to the device
|
||||
cudaMemcpy(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice);
|
||||
|
||||
// Call the kernel to perform element-wise modular multiplication
|
||||
vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elments);
|
||||
|
||||
cudaMemcpy(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(d_vec_a);
|
||||
cudaFree(d_vec_b);
|
||||
cudaFree(d_result);
|
||||
|
||||
return 0;
|
||||
}
|
||||
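A hedged usage sketch of the wrapper above. This is not from the repository: the example function name and sizes are invented, and it assumes `scalar_t` / `projective_t` from the curve config headers are default-constructible.

```cpp
#include <vector>

// Illustrative only: multiply n points by n scalars element-wise on host-resident data.
// vector_mod_mult copies both inputs to the device, launches vectorModMult and copies back.
void vec_mod_mult_example() {
  const size_t n = 1 << 12;
  std::vector<scalar_t> scalars(n);
  std::vector<projective_t> points(n);
  std::vector<projective_t> out(n);
  // ... fill scalars and points ...

  vector_mod_mult<projective_t, scalar_t>(scalars.data(), points.data(), out.data(), n);
}
```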
|
||||
template <typename E>
|
||||
__global__ void matrixVectorMult(E *matrix_elements, E *vector_elements, E *result, size_t dim)
|
||||
{
|
||||
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < dim)
|
||||
{
|
||||
result[tid] = E::zero();
|
||||
for (int i = 0; i < dim; i++)
|
||||
result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename E>
|
||||
int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim)
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
|
||||
int threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
|
||||
// Allocate memory on the device for the input vectors, the output vector, and the modulus
|
||||
E *d_matrix, *d_vector, *d_result;
|
||||
cudaMalloc(&d_matrix, (dim * dim) * sizeof(E));
|
||||
cudaMalloc(&d_vector, dim * sizeof(E));
|
||||
cudaMalloc(&d_result, dim * sizeof(E));
|
||||
|
||||
// Copy the input vectors and the modulus from the host to the device
|
||||
cudaMemcpy(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice);
|
||||
|
||||
// Call the kernel to perform element-wise modular multiplication
|
||||
matrixVectorMult<<<num_blocks, threads_per_block>>>(d_matrix, d_vector, d_result, dim);
|
||||
|
||||
cudaMemcpy(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(d_matrix);
|
||||
cudaFree(d_vector);
|
||||
cudaFree(d_result);
|
||||
|
||||
return 0;
|
||||
}
|
||||
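In other words (my paraphrase of the kernel above, not source text): with the matrix stored row-major, thread t computes one output coordinate

\[ y_t \;=\; \sum_{i=0}^{\mathrm{dim}-1} M_{t,i}\,x_i, \qquad t = 0,\dots,\mathrm{dim}-1, \]

so the launch needs one thread per row and every thread re-reads the whole input vector.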
|
||||
template <typename E, typename S>
|
||||
__global__ void batch_vector_mult_kernel(E *element_vec, S *mult_vec, unsigned n_mult, unsigned batch_size)
|
||||
{
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_mult * batch_size)
|
||||
{
|
||||
int mult_id = tid % n_mult;
|
||||
element_vec[tid] = mult_vec[mult_id] * element_vec[tid];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename E, typename S>
|
||||
int batch_vector_mult_template(E *element_vec, S *mult_vec, unsigned n_mult, unsigned batch_size)
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int NUM_THREADS = MAX_THREADS_PER_BLOCK;
|
||||
int NUM_BLOCKS = (n_mult * batch_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
|
||||
// Allocate memory on the device for the input vectors, the output vector, and the modulus
|
||||
S *d_mult_vec;
|
||||
E *d_element_vec;
|
||||
size_t n_mult_size = n_mult * sizeof(S);
|
||||
size_t full_size = n_mult * batch_size * sizeof(E);
|
||||
cudaMalloc(&d_mult_vec, n_mult_size);
|
||||
cudaMalloc(&d_element_vec, full_size);
|
||||
|
||||
// Copy the input vectors and the modulus from the host to the device
|
||||
cudaMemcpy(d_mult_vec, mult_vec, n_mult_size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_element_vec, element_vec, full_size, cudaMemcpyHostToDevice);
|
||||
|
||||
batch_vector_mult_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(d_element_vec, d_mult_vec, n_mult, batch_size);
|
||||
|
||||
cudaMemcpy(element_vec, d_element_vec, full_size, cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(d_mult_vec);
|
||||
cudaFree(d_element_vec);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" int32_t batch_vector_mult_proj_cuda(projective_t *inout,
|
||||
scalar_t *scalar_vec,
|
||||
size_t n_scalars,
|
||||
size_t batch_size,
|
||||
size_t device_id)
|
||||
{
|
||||
try
|
||||
{
|
||||
// TODO: device_id
|
||||
batch_vector_mult_template(inout, scalar_vec, n_scalars, batch_size);
|
||||
return CUDA_SUCCESS;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what()); // TODO: error code and message
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int32_t batch_vector_mult_scalar_cuda(scalar_t *inout,
|
||||
scalar_t *mult_vec,
|
||||
size_t n_mult,
|
||||
size_t batch_size,
|
||||
size_t device_id)
|
||||
{
|
||||
try
|
||||
{
|
||||
// TODO: device_id
|
||||
batch_vector_mult_template(inout, mult_vec, n_mult, batch_size);
|
||||
return CUDA_SUCCESS;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
printf("error %s", ex.what()); // TODO: error code and message
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
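A hedged usage sketch of the exported batched entry point above (not from the repository; the example function name and sizes are illustrative):

```cpp
#include <vector>

// Illustrative only: apply one length-n_mult multiplier vector to every chunk of a batch,
// in place. data[b * n_mult + i] is multiplied by mult[i] for every chunk index b.
void batch_mult_example() {
  const size_t n_mult = 1 << 10;
  const size_t batch_size = 16;
  std::vector<scalar_t> mult(n_mult);
  std::vector<scalar_t> data(n_mult * batch_size);
  // ... fill mult and data ...

  batch_vector_mult_scalar_cuda(data.data(), mult.data(), n_mult, batch_size, /*device_id=*/0);
}
```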
|
||||
extern "C" int32_t vec_mod_mult_point(projective_t *inout,
|
||||
scalar_t *scalar_vec,
|
||||
size_t n_elments,
|
||||
@@ -28,7 +191,7 @@ extern "C" int32_t vec_mod_mult_point(projective_t *inout,
|
||||
extern "C" int32_t vec_mod_mult_scalar(scalar_t *inout,
|
||||
scalar_t *scalar_vec,
|
||||
size_t n_elments,
|
||||
size_t device_id)
|
||||
size_t device_id) //TODO: unify with batch mult as batch_size=1
|
||||
{
|
||||
try
|
||||
{
|
||||
|
||||
@@ -1,110 +1,15 @@
|
||||
#pragma once
|
||||
#include <stdexcept>
|
||||
#include <cuda.h>
|
||||
#include "../../primitives/field.cuh"
|
||||
#include "../../curves/curve_config.cuh"
|
||||
#include "../../primitives/projective.cuh"
|
||||
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
|
||||
// TODO: headers for prototypes and .c .cpp .cu files for implementations
|
||||
template <typename E, typename S>
|
||||
__global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n_elments)
|
||||
{
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_elments)
|
||||
{
|
||||
result[tid] = scalar_vec[tid] * element_vec[tid];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename E, typename S>
|
||||
int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments) // TODO: in place so no need for third result vector
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
|
||||
int threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
|
||||
// Allocate memory on the device for the input vectors, the output vector, and the modulus
|
||||
S *d_vec_a;
|
||||
E *d_vec_b, *d_result;
|
||||
cudaMalloc(&d_vec_a, n_elments * sizeof(S));
|
||||
cudaMalloc(&d_vec_b, n_elments * sizeof(E));
|
||||
cudaMalloc(&d_result, n_elments * sizeof(E));
|
||||
|
||||
// Copy the input vectors and the modulus from the host to the device
|
||||
cudaMemcpy(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice);
|
||||
|
||||
// Call the kernel to perform element-wise modular multiplication
|
||||
vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elments);
|
||||
|
||||
cudaMemcpy(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(d_vec_a);
|
||||
cudaFree(d_vec_b);
|
||||
cudaFree(d_result);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename E, typename S>
|
||||
__global__ void batchVectorMult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size)
|
||||
{
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < n_scalars * batch_size)
|
||||
{
|
||||
int scalar_id = tid % n_scalars;
|
||||
element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename E, typename S>
|
||||
int batch_vector_mult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size)
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int NUM_THREADS = MAX_THREADS_PER_BLOCK;
|
||||
int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS;
|
||||
batchVectorMult<<<NUM_BLOCKS, NUM_THREADS>>>(scalar_vec, element_vec, n_scalars, batch_size);
|
||||
return 0;
|
||||
}
|
||||
int vector_mod_mult(S *scalar_vec, E *element_vec, E *result, size_t n_elments);
|
||||
|
||||
template <typename E>
|
||||
__global__ void matrixVectorMult(E *matrix_elements, E *vector_elements, E *result, size_t dim)
|
||||
{
|
||||
|
||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (tid < dim)
|
||||
{
|
||||
result[tid] = E::zero();
|
||||
for (int i = 0; i < dim; i++)
|
||||
result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename E>
|
||||
int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim)
|
||||
{
|
||||
// Set the grid and block dimensions
|
||||
int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
|
||||
int threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
|
||||
// Allocate memory on the device for the input vectors, the output vector, and the modulus
|
||||
E *d_matrix, *d_vector, *d_result;
|
||||
cudaMalloc(&d_matrix, (dim * dim) * sizeof(E));
|
||||
cudaMalloc(&d_vector, dim * sizeof(E));
|
||||
cudaMalloc(&d_result, dim * sizeof(E));
|
||||
|
||||
// Copy the input vectors and the modulus from the host to the device
|
||||
cudaMemcpy(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice);
|
||||
|
||||
// Call the kernel to perform element-wise modular multiplication
|
||||
matrixVectorMult<<<num_blocks, threads_per_block>>>(d_matrix, d_vector, d_result, dim);
|
||||
|
||||
cudaMemcpy(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(d_matrix);
|
||||
cudaFree(d_vector);
|
||||
cudaFree(d_result);
|
||||
|
||||
return 0;
|
||||
}
|
||||
int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim);
|
||||
|
||||
@@ -26,8 +26,6 @@ template <class CONFIG> class Field {
|
||||
static constexpr HOST_INLINE Field omega(uint32_t log_size) {
|
||||
// Quick fix to linking issue, permanent fix will follow
|
||||
switch (log_size) {
|
||||
case 0:
|
||||
return Field { CONFIG::one };
|
||||
case 1:
|
||||
return Field { CONFIG::omega1 };
|
||||
case 2:
|
||||
@@ -99,8 +97,6 @@ template <class CONFIG> class Field {
|
||||
static constexpr HOST_INLINE Field omega_inv(uint32_t log_size) {
|
||||
// Quick fix to linking issue, permanent fix will follow
|
||||
switch (log_size) {
|
||||
case 0:
|
||||
return Field { CONFIG::one };
|
||||
case 1:
|
||||
return Field { CONFIG::omega_inv1 };
|
||||
case 2:
|
||||
@@ -245,7 +241,7 @@ template <class CONFIG> class Field {
|
||||
}
|
||||
|
||||
|
||||
// private:
|
||||
private:
|
||||
typedef storage<TLC> ff_storage;
|
||||
typedef storage<2*TLC> ff_wide_storage;
|
||||
|
||||
@@ -387,14 +383,6 @@ template <class CONFIG> class Field {
|
||||
return CARRY_OUT ? carry : 0;
|
||||
}
|
||||
|
||||
static constexpr HOST_INLINE uint32_t sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) {
|
||||
uint32_t carry = 0;
|
||||
host_math::carry_chain<2 * TLC, false, true> chain;
|
||||
for (unsigned i = 0; i < num_limbs; i++)
|
||||
r[i] = chain.sub(x[i], y[i], carry);
|
||||
return carry;
|
||||
}
|
||||
|
||||
template <bool CARRY_OUT, typename T> static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T &xs, const T &ys, T &rs) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
|
||||
@@ -419,17 +407,7 @@ template <class CONFIG> class Field {
|
||||
}
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void mul_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t start_i = 0) {
|
||||
#pragma unroll
|
||||
for (size_t i = start_i; i < n; i += 2) {
|
||||
acc[i] = ptx::mul_lo(a[i], bi);
|
||||
acc[i + 1] = ptx::mul_hi(a[i], bi);
|
||||
}
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void cmad_n(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC) {
|
||||
// multiply scalar by vector
|
||||
// acc = acc + bi*A[::2]
|
||||
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
|
||||
acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
|
||||
#pragma unroll
|
||||
@@ -439,21 +417,7 @@ template <class CONFIG> class Field {
|
||||
}
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void cmad_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx=0) {
|
||||
// multiply scalar by vector
|
||||
// acc = acc + bi*A[::2]
|
||||
acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]);
|
||||
acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]);
|
||||
#pragma unroll
|
||||
for (size_t i = a_start_idx + 2; i < n; i += 2) {
|
||||
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
|
||||
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
|
||||
}
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void mad_row(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC) {
|
||||
// odd = odd + bi*A
|
||||
// even = even + bi*A
|
||||
cmad_n(odd, a + 1, bi, n - 2);
|
||||
odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0);
|
||||
odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0);
|
||||
@@ -461,16 +425,6 @@ template <class CONFIG> class Field {
|
||||
odd[n - 1] = ptx::addc(odd[n - 1], 0);
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void mad_row_msb(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) {
|
||||
// odd = odd + bi*A
|
||||
// even = even + bi*A
|
||||
cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1);
|
||||
odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0);
|
||||
odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0);
|
||||
cmad_n_msb(even, a, bi, n, a_start_idx);
|
||||
odd[n - 1] = ptx::addc(odd[n - 1], 0);
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
const uint32_t *a = as.limbs;
|
||||
const uint32_t *b = bs.limbs;
|
||||
@@ -492,289 +446,13 @@ template <class CONFIG> class Field {
|
||||
even[i + 1] = ptx::addc(even[i + 1], 0);
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t *r) {
|
||||
r[0] = ptx::mul_lo(a, b);
|
||||
r[1] = ptx::mul_hi(a, b);
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
const uint32_t *a = as.limbs;
|
||||
const uint32_t *b = bs.limbs;
|
||||
uint32_t *r = rs.limbs;
|
||||
uint32_t i, j;
|
||||
uint32_t *even = rs.limbs;
|
||||
__align__(8) uint32_t odd[2 * TLC];
|
||||
for (uint32_t i = 0; i < 2 * TLC; i++)
|
||||
{
|
||||
even[i] = 0;
|
||||
odd[i] = 0;
|
||||
}
|
||||
// first row special case, no carry in, no carry out. split into two parts, even and odd.
|
||||
for (i = 0; i < TLC - 1; i+=2 )
|
||||
{
|
||||
mult_no_carry(b[0], a[i], &even[i]);
|
||||
mult_no_carry(b[0], a[i + 1], &odd[i]);
|
||||
}
|
||||
|
||||
// doing two rows at one loop
|
||||
for (i = 1; i < TLC - 1; i+=2)
|
||||
{
|
||||
// odd bi's
|
||||
// multiply accumulate even part of new row with odd part prev row (needs a carry)
|
||||
// // j = 0, no carry in, only carry out
|
||||
odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
|
||||
odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
|
||||
// for loop carry in carry out
|
||||
for (j = 2; j < TLC; j+=2) // 2, 4, 6
|
||||
{
|
||||
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
|
||||
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
|
||||
}
|
||||
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
|
||||
|
||||
// multiply accumulate odd part of new row with even part prev row (doesnt need a carry)
|
||||
// j = 1, no carry in, only carry out
|
||||
even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
|
||||
even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
|
||||
// for loop carry in carry out
|
||||
for (j = 3; j < TLC; j+=2)
|
||||
{
|
||||
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
|
||||
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
|
||||
}
|
||||
|
||||
// even bi's
|
||||
// multiply accumulate even part of new row with even part of prev row // needs a carry
|
||||
// j = 0, no carry in, only carry out
|
||||
even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]);
|
||||
even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]);
|
||||
// for loop, carry in, carry out.
|
||||
for (j = 2; j < TLC; j+=2)
|
||||
{
|
||||
even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
|
||||
even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
|
||||
}
|
||||
even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
|
||||
|
||||
// multiply accumulate odd part of new row with odd part of prev row
|
||||
// j = 1, no carry in, only carry out
|
||||
odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]);
|
||||
odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]);
|
||||
// for loop, carry in, carry out.
|
||||
for (j = 3; j < TLC; j+=2)
|
||||
{
|
||||
odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
|
||||
odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
|
||||
odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
|
||||
// for loop carry in carry out
|
||||
for (j = 2; j < TLC; j+=2)
|
||||
{
|
||||
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
|
||||
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
|
||||
}
|
||||
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
|
||||
|
||||
// multiply accumulate odd part of new row with even part prev row
|
||||
// j = 1, no carry in, only carry out
|
||||
even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
|
||||
even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
|
||||
// for loop carry in carry out
|
||||
for (j = 3; j < TLC; j+=2)
|
||||
{
|
||||
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
|
||||
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
|
||||
}
|
||||
|
||||
// add even and odd parts
|
||||
even[1] = ptx::add_cc(even[1], odd[0]);
|
||||
for (i = 1; i < 2 * TLC - 2; i++)
|
||||
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
|
||||
even[i + 1] = ptx::addc(even[i + 1], 0);
|
||||
}
|
||||
|
||||
|
||||
static DEVICE_INLINE void ingo_msb_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
const uint32_t *a = as.limbs;
|
||||
const uint32_t *b = bs.limbs;
|
||||
uint32_t *r = rs.limbs;
|
||||
uint32_t i, j;
|
||||
uint32_t *even = rs.limbs;
|
||||
__align__(8) uint32_t odd[2 * TLC];
|
||||
for (uint32_t i = 0; i < 2 * TLC; i++)
|
||||
{
|
||||
even[i] = 0;
|
||||
odd[i] = 0;
|
||||
}
|
||||
// only last element from first row.
|
||||
mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]);
|
||||
|
||||
// doing two rows at one loop
|
||||
#pragma unroll
|
||||
for (i = 1; i < TLC - 1; i+=2)
|
||||
{
|
||||
const uint32_t first_active_j = TLC - 1 - i;
|
||||
const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2));
|
||||
const uint32_t first_active_j_even = first_active_j + first_active_j % 2 ;
|
||||
// odd bi's
|
||||
// multiply accumulate even part of new row with odd part prev row (needs a carry)
|
||||
// j = 0, no carry in, only carry out
|
||||
odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]);
|
||||
odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]);
|
||||
// for loop carry in carry out
|
||||
#pragma unroll
|
||||
for (j = first_active_j_even + 2; j < TLC; j+=2)
|
||||
{
|
||||
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
|
||||
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
|
||||
}
|
||||
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
|
||||
|
||||
// multiply accumulate odd part of new row with even part prev row (doesnt need a carry)
|
||||
// j = 1, no carry in, only carry out
|
||||
even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]);
|
||||
even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]);
|
||||
// for loop carry in carry out
|
||||
#pragma unroll
|
||||
for (j = first_active_j_odd + 2; j < TLC; j+=2)
|
||||
{
|
||||
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
|
||||
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
|
||||
}
|
||||
|
||||
// even bi's
|
||||
uint32_t const first_active_j1 = TLC - 1 - (i + 1) ;
|
||||
uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2));
|
||||
uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2;
|
||||
// multiply accumulate even part of new row with even part of prev row // needs a carry
|
||||
// j = 0, no carry in, only carry out
|
||||
even[first_active_j_even1 + i + 1] = ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]);
|
||||
even[first_active_j_even1 + i + 2] = ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]);
|
||||
// for loop, carry in, carry out.
|
||||
#pragma unroll
|
||||
for (j = first_active_j_even1 + 2; j < TLC; j+=2)
|
||||
{
|
||||
even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
|
||||
even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
|
||||
}
|
||||
even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
|
||||
|
||||
// multiply accumulate odd part of new row with odd part of prev row
|
||||
// j = 1, no carry in, only carry out
|
||||
odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]);
|
||||
odd[first_active_j_odd1+ i + 1] = ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]);
|
||||
// for loop, carry in, carry out.
|
||||
#pragma unroll
|
||||
for (j = first_active_j_odd1 + 2; j < TLC; j+=2)
|
||||
{
|
||||
odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
|
||||
odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// last round, i = TLC - 1
|
||||
odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
|
||||
odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
|
||||
// for loop carry in carry out
|
||||
#pragma unroll
|
||||
for (j = 2; j < TLC; j+=2)
|
||||
{
|
||||
odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
|
||||
odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
|
||||
}
|
||||
odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
|
||||
|
||||
// multiply accumulate odd part of new row with even part prev row
|
||||
// j = 1, no carry in, only carry out
|
||||
even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
|
||||
even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
|
||||
// for loop carry in carry out
|
||||
#pragma unroll
|
||||
for (j = 3; j < TLC; j+=2)
|
||||
{
|
||||
even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
|
||||
even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
|
||||
}
|
||||
|
||||
// add even and odd parts
|
||||
even[1] = ptx::add_cc(even[1], odd[0]);
|
||||
#pragma unroll
|
||||
for (i = 1; i < 2 * TLC - 2; i++)
|
||||
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
|
||||
even[i + 1] = ptx::addc(even[i + 1], 0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
// r = a * b is correct for the first TLC + 1 digits (limbs TLC + 1 through 2*TLC - 2 are not computed).
|
||||
const uint32_t *a = as.limbs;
|
||||
const uint32_t *b = bs.limbs;
|
||||
uint32_t *even = rs.limbs;
|
||||
__align__(8) uint32_t odd[2 * TLC - 2];
|
||||
mul_n(even, a, b[0]);
|
||||
mul_n(odd, a + 1, b[0]);
|
||||
mad_row(&even[2], &odd[0], a, b[1]);
|
||||
size_t i;
|
||||
#pragma unroll
|
||||
for (i = 2; i < TLC - 1; i += 2) {
|
||||
mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2);
|
||||
mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2);
|
||||
}
|
||||
|
||||
// merge |even| and |odd|
|
||||
even[1] = ptx::add_cc(even[1], odd[0]);
|
||||
for (i = 1; i < TLC + 1; i++)
|
||||
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
|
||||
even[i + 1] = ptx::addc(even[i + 1], 0);
|
||||
}
|
||||
|
||||
static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
const uint32_t *a = as.limbs;
|
||||
const uint32_t *b = bs.limbs;
|
||||
uint32_t *even = rs.limbs;
|
||||
__align__(8) uint32_t odd[2 * TLC - 2];
|
||||
for (int i=0; i<2*TLC - 1; i++)
|
||||
{
|
||||
even[i] = 0;
|
||||
odd[i] = 0;
|
||||
}
|
||||
uint32_t min_indexes_sum = TLC - 1;
|
||||
// only diagonal
|
||||
mul_n_msb(even, a, b[0], TLC, min_indexes_sum);
|
||||
mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1);
|
||||
mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1);
|
||||
size_t i;
|
||||
#pragma unroll
|
||||
for (i = 2; i < TLC - 1; i += 2) {
|
||||
mad_row(&odd[i], &even[i], a, b[i]);
|
||||
mad_row(&even[i + 2], &odd[i], a, b[i + 1]);
|
||||
}
|
||||
// merge |even| and |odd|
|
||||
even[1] = ptx::add_cc(even[1], odd[0]);
|
||||
for (i = 1; i < 2 * TLC - 2; i++)
|
||||
even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
|
||||
even[i + 1] = ptx::addc(even[i + 1], 0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static HOST_INLINE void multiply_raw_host(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
const uint32_t *a = as.limbs;
|
||||
const uint32_t *b = bs.limbs;
|
||||
uint32_t *r = rs.limbs;
|
||||
for (unsigned i = 0; i < TLC; i++) {
|
||||
uint32_t carry = 0;
|
||||
for (unsigned j = 0; j < TLC; j++)
|
||||
for (unsigned j = 0; j < TLC; j++)
|
||||
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
|
||||
r[TLC + i] = carry;
|
||||
}
|
||||
@@ -788,22 +466,6 @@ template <class CONFIG> class Field {
|
||||
#endif
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return multiply_lsb_raw_device(as, bs, rs);
|
||||
#else
|
||||
return multiply_raw_host(as, bs, rs);
|
||||
#endif
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return ingo_msb_multiply_raw_device(as, bs, rs);
|
||||
#else
|
||||
return multiply_raw_host(as, bs, rs);
|
||||
#endif
|
||||
}
|
||||
|
||||
public:
|
||||
ff_storage limbs_storage;
|
||||
|
||||
@@ -875,47 +537,20 @@ template <class CONFIG> class Field {
|
||||
return rs;
|
||||
}
|
||||
|
||||
static constexpr DEVICE_INLINE uint32_t sub_limbs_partial_device(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) {
|
||||
r[0] = ptx::sub_cc(x[0], y[0]);
|
||||
#pragma unroll
|
||||
for (unsigned i = 1; i < num_limbs; i++)
|
||||
r[i] = ptx::subc_cc(x[i], y[i]);
|
||||
return ptx::subc(0, 0);
|
||||
}
|
||||
|
||||
static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs_partial(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return sub_limbs_partial_device(x, y, r, num_limbs);
|
||||
#else
|
||||
return sub_limbs_partial_host(x, y, r, num_limbs);
|
||||
#endif
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) {
|
||||
//printf("operator* called \n");
|
||||
wide xy = mul_wide(xs, ys); // full mult
|
||||
Field xy_hi = xy.get_higher_with_slack(); // xy << slack_bits
|
||||
wide xy = mul_wide(xs, ys);
|
||||
Field xy_hi = xy.get_higher_with_slack();
|
||||
wide l = {};
|
||||
multiply_raw_msb(xy_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult
|
||||
multiply_raw(xy_hi.limbs_storage, get_m(), l.limbs_storage);
|
||||
Field l_hi = l.get_higher_with_slack();
|
||||
wide lp = {};
|
||||
multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult
|
||||
wide r_wide = xy - lp;
|
||||
multiply_raw(l_hi.limbs_storage, get_modulus(), lp.limbs_storage);
|
||||
wide r_wide = xy - lp;
|
||||
wide r_wide_reduced = {};
|
||||
// uint32_t reduced = sub_limbs<true>(r_wide.limbs_storage, modulus_wide(), r_wide_reduced.limbs_storage);
|
||||
// r_wide = reduced ? r_wide : r_wide_reduced;
|
||||
for (unsigned i = 0; i < TLC + 1; i++)
|
||||
{
|
||||
uint32_t carry = sub_limbs_partial(r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1);
|
||||
if (carry == 0) // continue to reduce
|
||||
r_wide = r_wide_reduced;
|
||||
else // done
|
||||
break;
|
||||
}
|
||||
|
||||
// number of wrap around is bounded by TLC + 1 times.
|
||||
uint32_t reduced = sub_limbs<true>(r_wide.limbs_storage, modulus_wide(), r_wide_reduced.limbs_storage);
|
||||
r_wide = reduced ? r_wide : r_wide_reduced;
|
||||
Field r = r_wide.get_lower();
|
||||
return (r);
|
||||
return reduce<1>(r);
|
||||
}
|
||||
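Reading the two versions of `operator*` above, both follow a Barrett-style reduction; roughly (my summary, not text from the diff), with modulus p, a working width of k bits (TLC limbs plus the slack bits) and a precomputed constant m close to \(\lfloor 2^{2k}/p \rfloor\):

\[ \hat q \;=\; \Big\lfloor \frac{\lfloor xy/2^{k}\rfloor \cdot m}{2^{k}} \Big\rfloor, \qquad r \;=\; xy - \hat q\,p, \]

after which r can exceed p only by a small bounded amount, so a few conditional subtractions of p (at most TLC + 1 in the looped variant, exactly one in the `reduce<1>` variant) bring it into [0, p). The MSB/LSB multiply variants exist because, per the "MSB mult" / "LSB mult" comments, only the high half of the quotient estimate and the low half of q·p are actually needed.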
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys) {
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "test_kernels.cuh"
|
||||
#include <iostream>
|
||||
#include <boost/multiprecision/cpp_int.hpp>
|
||||
namespace mp = boost::multiprecision;
|
||||
|
||||
|
||||
template <class T>
|
||||
int device_populate_random(T* d_elements, unsigned n) {
|
||||
@@ -21,19 +20,9 @@ int device_set(T* d_elements, T el, unsigned n) {
|
||||
return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
mp::int1024_t convert_to_boost_mp(uint32_t *a, uint32_t length)
|
||||
{
|
||||
mp::int1024_t res = 0;
|
||||
for (uint32_t i = 0; i < length; i++)
|
||||
{
|
||||
res += (mp::int1024_t)(a[i]) << 32 * i;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
class PrimitivesTest : public ::testing::Test {
|
||||
protected:
|
||||
static const unsigned n = 1 << 10;
|
||||
static const unsigned n = 1 << 5;
|
||||
|
||||
proj *points1{};
|
||||
proj *points2{};
|
||||
@@ -47,8 +36,6 @@ protected:
|
||||
proj *res_points2{};
|
||||
scalar_field *res_scalars1{};
|
||||
scalar_field *res_scalars2{};
|
||||
scalar_field::wide *res_scalars_wide{};
|
||||
scalar_field::wide *res_scalars_wide_full{};
|
||||
|
||||
PrimitivesTest() {
|
||||
assert(!cudaDeviceReset());
|
||||
@@ -64,9 +51,6 @@ protected:
|
||||
assert(!cudaMallocManaged(&res_points2, n * sizeof(proj)));
|
||||
assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field)));
|
||||
assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_field)));
|
||||
assert(!cudaMallocManaged(&res_scalars_wide, n * sizeof(scalar_field::wide)));
|
||||
assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field::wide)));
|
||||
|
||||
}
|
||||
|
||||
~PrimitivesTest() override {
|
||||
@@ -82,10 +66,6 @@ protected:
|
||||
cudaFree(res_points2);
|
||||
cudaFree(res_scalars1);
|
||||
cudaFree(res_scalars2);
|
||||
|
||||
cudaFree(res_scalars_wide);
|
||||
cudaFree(res_scalars_wide_full);
|
||||
|
||||
cudaDeviceReset();
|
||||
}
|
||||
|
||||
@@ -102,9 +82,6 @@ protected:
|
||||
ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(proj)), cudaSuccess);
|
||||
ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field)), cudaSuccess);
|
||||
ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field)), cudaSuccess);
|
||||
|
||||
ASSERT_EQ(cudaMemset(res_scalars_wide, 0, n * sizeof(scalar_field::wide)), cudaSuccess);
|
||||
ASSERT_EQ(cudaMemset(res_scalars_wide_full, 0, n * sizeof(scalar_field::wide)), cudaSuccess);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -278,189 +255,6 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) {
|
||||
ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]);
|
||||
}
|
||||
|
||||
TEST_F(PrimitivesTest, MP_LSB_MULT) {
|
||||
// LSB multiply, check correctness of first TLC + 1 digits result.
|
||||
ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
|
||||
std::cout << "first GPU lsb mult output = 0x";
|
||||
for (int i=0; i<2*scalar_field::TLC; i++)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i];
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
|
||||
ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
|
||||
std::cout << "first GPU full mult output = 0x";
|
||||
for (int i=0; i<2*scalar_field::TLC; i++)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i];
|
||||
}
|
||||
std::cout << std::endl;
|
||||
for (int j = 0; j < n; j++)
|
||||
{
|
||||
for (int i=0; i<scalar_field::TLC + 1; i++)
|
||||
{
|
||||
ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(PrimitivesTest, MP_MSB_MULT) {
|
||||
// MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
|
||||
ASSERT_EQ(mp_msb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
|
||||
std::cout << "first GPU msb mult output = 0x";
|
||||
for (int i=2*scalar_field::TLC - 1; i >=0 ; i--)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
|
||||
ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
|
||||
std::cout << "first GPU full mult output = 0x";
|
||||
for (int i=2*scalar_field::TLC - 1; i >=0 ; i--)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
for (int i=0; i < 2*scalar_field::TLC - 1; i++)
|
||||
{
|
||||
if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
|
||||
std::cout << "matched word idx = " << i << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
TEST_F(PrimitivesTest, INGO_MP_MULT) {
|
||||
// MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
|
||||
ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess);
|
||||
std::cout << "INGO = 0x";
|
||||
for (int i=0; i < 2*scalar_field::TLC ; i++)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
|
||||
ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
|
||||
std::cout << "ZKSYNC = 0x";
|
||||
for (int i=0; i < 2*scalar_field::TLC ; i++)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
for (int i=0; i < 2*scalar_field::TLC - 1; i++)
|
||||
{
|
||||
if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i])
|
||||
std::cout << "matched word idx = " << i << std::endl;
|
||||
}
|
||||
for (int j=0; j<n; j++)
|
||||
{
|
||||
for (int i=0; i < 2*scalar_field::TLC - 1; i++)
|
||||
{
|
||||
ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
TEST_F(PrimitivesTest, INGO_MP_MSB_MULT) {
|
||||
// MSB multiply, take n msb bits of multiplication, assert that the error is up to 1.
|
||||
ASSERT_EQ(ingo_mp_msb_mult(scalars1, scalars2, res_scalars_wide, n), cudaSuccess);
|
||||
std::cout << "INGO MSB = 0x";
|
||||
for (int i=2*scalar_field::TLC - 1; i >= 0 ; i--)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess);
|
||||
std::cout << "ZKSYNC = 0x";
|
||||
for (int i=2*scalar_field::TLC - 1; i >= 0 ; i--)
|
||||
{
|
||||
std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " ";
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
|
||||
// for (int i=scalar_field::TLC; i < 2*scalar_field::TLC - 1; i++)
|
||||
// {
|
||||
// ASSERT_EQ(in_bound, true);
|
||||
// }
|
||||
// for (int j=0; j<n; j++)
|
||||
// {
|
||||
// for (int i=0; i < 2*scalar_field::TLC - 1; i++)
|
||||
// {
|
||||
// ASSERT_EQ(res_scalars_wide_full[j].limbs_storage.limbs[i], res_scalars_wide[j].limbs_storage.limbs[i]);
|
||||
// }
|
||||
// }
|
||||
// mp testing
|
||||
mp::int1024_t scalar_1_mp = 0;
|
||||
mp::int1024_t scalar_2_mp = 0;
|
||||
mp::int1024_t res_mp = 0;
|
||||
mp::int1024_t res_gpu = 0;
|
||||
uint32_t num_limbs = scalar_field::TLC;
|
||||
|
||||
for (int j=0; j<n; j++)
|
||||
{
|
||||
uint32_t* scalar1_limbs = scalars1[j].limbs_storage.limbs;
|
||||
uint32_t* scalar2_limbs = scalars2[j].limbs_storage.limbs;
|
||||
scalar_1_mp = convert_to_boost_mp(scalar1_limbs, num_limbs);
|
||||
scalar_2_mp = convert_to_boost_mp(scalar2_limbs, num_limbs);
|
||||
res_mp = scalar_1_mp * scalar_2_mp;
|
||||
res_mp = res_mp >> (num_limbs * 32);
|
||||
res_gpu = convert_to_boost_mp(&(res_scalars_wide[j]).limbs_storage.limbs[num_limbs], num_limbs);
|
||||
std::cout << "res mp = " << res_mp << std::endl;
|
||||
std::cout << "res gpu = " << res_gpu << std::endl;
|
||||
std::cout << "error = " << res_mp - res_gpu << std::endl;
|
||||
bool upper_bound = res_gpu <= res_mp;
|
||||
bool lower_bound = res_gpu > (res_mp - num_limbs);
|
||||
bool in_bound = upper_bound && lower_bound;
|
||||
|
||||
|
||||
ASSERT_EQ(in_bound, true);
|
||||
}
|
||||
}
|
||||
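To spell out what the assertions above encode (my reading of the test, not a claim from the source): if \(H = \lfloor a\cdot b / 2^{32\cdot \mathrm{TLC}} \rfloor\) is the exact high half and \(H'\) is the high half produced by the truncated MSB multiplication, the test requires

\[ H - \mathrm{TLC} \;<\; H' \;\le\; H, \]

i.e. dropping the low partial products can only under-estimate the high half, by less than one unit per discarded carry column.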
|
||||
TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) {
|
||||
std::cout << " taking num limbs " << std::endl;
|
||||
uint32_t num_limbs = scalar_field::TLC;
|
||||
std::cout << " calling gpu... = " << std::endl;
|
||||
ASSERT_EQ(ingo_mp_mod_mult(scalars1, scalars2, res_scalars1, n), cudaSuccess);
|
||||
std::cout << " gpu call done " << std::endl;
|
||||
// mp testing
|
||||
mp::int1024_t scalar_1_mp = 0;
|
||||
mp::int1024_t scalar_2_mp = 0;
|
||||
mp::int1024_t res_mp = 0;
|
||||
mp::int1024_t res_gpu = 0;
|
||||
mp::int1024_t p = convert_to_boost_mp(scalar_field::get_modulus().limbs, num_limbs);
|
||||
std::cout << " p = " << p << std::endl;
|
||||
|
||||
|
||||
for (int j=0; j<n; j++)
|
||||
{
|
||||
uint32_t* scalar1_limbs = scalars1[j].limbs_storage.limbs;
|
||||
uint32_t* scalar2_limbs = scalars2[j].limbs_storage.limbs;
|
||||
scalar_1_mp = convert_to_boost_mp(scalar1_limbs, num_limbs);
|
||||
scalar_2_mp = convert_to_boost_mp(scalar2_limbs, num_limbs);
|
||||
// std::cout << " s1 = " << scalar_1_mp << std::endl;
|
||||
// std::cout << " s2 = " << scalar_2_mp << std::endl;
|
||||
res_mp = (scalar_1_mp * scalar_2_mp) % p;
|
||||
res_gpu = convert_to_boost_mp((res_scalars1[j]).limbs_storage.limbs, num_limbs);
|
||||
std::cout << "res mp = " << res_mp << std::endl;
|
||||
std::cout << "res gpu = " << res_gpu << std::endl;
|
||||
std::cout << "error = " << res_mp - res_gpu << std::endl;
|
||||
ASSERT_EQ(res_gpu, res_mp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
@@ -105,87 +105,3 @@ int point_vec_to_affine(const proj *x, affine *result, const unsigned count) {
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
__global__ void mp_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
|
||||
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
scalar_field::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
|
||||
}
|
||||
|
||||
|
||||
int mp_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
|
||||
{
|
||||
mp_mult_kernel<<<1, 32>>>(x, y, result);
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
__global__ void mp_lsb_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
|
||||
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
scalar_field::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
|
||||
}
|
||||
|
||||
|
||||
int mp_lsb_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
|
||||
{
|
||||
mp_lsb_mult_kernel<<<1, 32>>>(x, y, result);
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
__global__ void mp_msb_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
|
||||
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
scalar_field::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
|
||||
}
|
||||
|
||||
|
||||
int mp_msb_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
|
||||
{
|
||||
mp_msb_mult_kernel<<<1, 1>>>(x, y, result);
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
__global__ void ingo_mp_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
|
||||
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
scalar_field::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
|
||||
}
|
||||
|
||||
|
||||
int ingo_mp_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result)
|
||||
{
|
||||
ingo_mp_mult_kernel<<<1, 32>>>(x, y, result);
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
__global__ void ingo_mp_msb_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field::wide *result) {
|
||||
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
scalar_field::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage);
|
||||
}
|
||||
|
||||
|
||||
int ingo_mp_msb_mult(const scalar_field *x, scalar_field *y, scalar_field::wide *result, const unsigned n)
|
||||
{
|
||||
ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result);
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
__global__ void ingo_mp_mod_mult_kernel(const scalar_field *x, const scalar_field *y, scalar_field *result) {
|
||||
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
result[gid] = x[gid] * y[gid];
|
||||
}
|
||||
|
||||
|
||||
int ingo_mp_mod_mult(const scalar_field *x, scalar_field *y, scalar_field *result, const unsigned n)
|
||||
{
|
||||
ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result);
|
||||
int error = cudaGetLastError();
|
||||
return error ? error : cudaDeviceSynchronize();
|
||||
}
|
||||
122 src/field.rs
@@ -1,12 +1,8 @@
|
||||
use std::ffi::c_uint;
|
||||
use std::mem::transmute;
|
||||
|
||||
use ark_bls12_381::{Fq, G1Affine, G1Projective};
|
||||
use ark_ec::AffineCurve;
|
||||
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
|
||||
|
||||
use rustacuda_core::DeviceCopy;
|
||||
use rustacuda_derive::DeviceCopy;
|
||||
use ark_ff::{BigInteger384, PrimeField};
|
||||
|
||||
use crate::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
|
||||
|
||||
@@ -16,8 +12,6 @@ pub struct Field<const NUM_LIMBS: usize> {
|
||||
pub s: [u32; NUM_LIMBS],
|
||||
}
|
||||
|
||||
unsafe impl<const NUM_LIMBS: usize> DeviceCopy for Field<NUM_LIMBS> {}
|
||||
|
||||
impl<const NUM_LIMBS: usize> Default for Field<NUM_LIMBS> {
|
||||
fn default() -> Self {
|
||||
Field::zero()
|
||||
@@ -93,25 +87,9 @@ impl ScalarField {
|
||||
pub fn limbs(&self) -> [u32; SCALAR_LIMBS] {
|
||||
self.s
|
||||
}
|
||||
|
||||
pub fn to_ark(&self) -> BigInteger256 {
|
||||
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
|
||||
}
|
||||
|
||||
pub fn from_ark(ark: BigInteger256) -> Self {
|
||||
Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
|
||||
}
|
||||
|
||||
pub fn to_ark_transmute(&self) -> BigInteger256 {
|
||||
unsafe { transmute(*self) }
|
||||
}
|
||||
|
||||
pub fn from_ark_transmute(v: BigInteger256) -> ScalarField {
|
||||
unsafe { transmute(v) }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, DeviceCopy)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[repr(C)]
|
||||
pub struct Point {
|
||||
pub x: BaseField,
|
||||
@@ -179,7 +157,7 @@ impl PartialEq for Point {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
#[repr(C)]
|
||||
pub struct PointAffineNoInfinity {
|
||||
pub x: BaseField,
|
||||
@@ -298,12 +276,102 @@ impl ScalarField {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
#[repr(C)]
|
||||
pub struct Scalar {
|
||||
pub s: ScalarField, //TODO: do we need this wrapping struct?
|
||||
}
|
||||
|
||||
impl Scalar {
|
||||
pub fn from_limbs_le(value: &[u32]) -> Scalar {
|
||||
Scalar {
|
||||
s: ScalarField::from_limbs(value),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_limbs_be(value: &[u32]) -> Scalar {
|
||||
let mut value = value.to_vec();
|
||||
value.reverse();
|
||||
Self::from_limbs_le(&value)
|
||||
}
|
||||
|
||||
pub fn limbs(&self) -> Vec<u32> {
|
||||
self.s.limbs().to_vec()
|
||||
}
|
||||
|
||||
pub fn one() -> Self {
|
||||
Scalar {
|
||||
s: ScalarField::one(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zero() -> Self {
|
||||
Scalar {
|
||||
s: ScalarField::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Scalar {
|
||||
fn default() -> Self {
|
||||
Scalar {
|
||||
s: ScalarField::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ark_bls12_381::Fr;
|
||||
use std::mem::transmute;
|
||||
|
||||
use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}, field::{Point, ScalarField}};
|
||||
use ark_bls12_381::Fr;
|
||||
use ark_ff::{BigInteger256, PrimeField};
|
||||
|
||||
use crate::{utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec}, field::Point};
|
||||
|
||||
use super::{Scalar, ScalarField};
|
||||
|
||||
impl ScalarField {
|
||||
pub fn to_ark(&self) -> BigInteger256 {
|
||||
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
|
||||
}
|
||||
|
||||
pub fn from_ark(ark: BigInteger256) -> Self {
|
||||
Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
|
||||
}
|
||||
|
||||
pub fn to_ark_transmute(&self) -> BigInteger256 {
|
||||
unsafe { transmute(*self) }
|
||||
}
|
||||
|
||||
pub fn from_ark_transmute(v: BigInteger256) -> ScalarField {
|
||||
unsafe { transmute(v) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Scalar {
|
||||
pub fn to_ark_mod_p(&self) -> Fr {
|
||||
Fr::new(self.s.to_ark())
|
||||
}
|
||||
|
||||
pub fn to_ark_repr(&self) -> Fr {
|
||||
Fr::from_repr(self.s.to_ark()).unwrap()
|
||||
}
|
||||
|
||||
pub fn to_ark_transmute(&self) -> Fr {
|
||||
unsafe { std::mem::transmute(*self) }
|
||||
}
|
||||
|
||||
pub fn from_ark_transmute(v: &Fr) -> Scalar {
|
||||
unsafe { std::mem::transmute_copy(v) }
|
||||
}
|
||||
|
||||
pub fn from_ark(v: &Fr) -> Scalar {
|
||||
Scalar {
|
||||
s: ScalarField::from_ark(v.into_repr()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ark_scalar_convert() {
|
||||
|
||||
1015 src/lib.rs (file diff suppressed because it is too large)
@@ -65,12 +65,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_u64_vec_to_u32_vec() {
|
||||
let arr_u64 = [
|
||||
0x2FFFFFFF00000001,
|
||||
0x4FFFFFFF00000003,
|
||||
0x6FFFFFFF00000005,
|
||||
0x8FFFFFFF00000007,
|
||||
];
|
||||
let arr_u64 = [0x2FFFFFFF00000001, 0x4FFFFFFF00000003, 0x6FFFFFFF00000005, 0x8FFFFFFF00000007];
|
||||
|
||||
let arr_u32_expected = [1, 0x2fffffff, 3, 0x4fffffff, 5, 0x6fffffff, 7, 0x8fffffff];
|
||||
|
||||
|
||||