mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-09 15:37:58 -05:00
Add montgomery to vec_ops and an example of it
25
examples/c++/mont_vec_ops/CMakeLists.txt
Normal file
@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
  set(CMAKE_CUDA_ARCHITECTURES native) # requires CMake 3.24+; on earlier versions this is ignored and no architecture target is passed
endif()
project(example LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
  example
  example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/)
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
42
examples/c++/mont_vec_ops/README.md
Normal file
@@ -0,0 +1,42 @@
# Icicle example: Montgomery vector operations (mul, add, sub) for all possible combinations of the options:

- is_a_on_device
- is_b_on_device
- is_result_on_device
- is_in_montgomery_form

(is_async isn't exercised; a sketch of these flags follows.)
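These flags live on `vec_ops::VecOpsConfig` (see the header change at the bottom of this commit). A minimal sketch of toggling them, assuming the bn254 field used throughout this example:

```c++
#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"

// A hypothetical configuration: vec_a on host, vec_b on device,
// result copied back to host, values in Montgomery form.
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
config.is_a_on_device = false;
config.is_b_on_device = true;
config.is_result_on_device = false;
config.is_in_montgomery_form = true;
// config.is_async stays at its default (false); the example does not exercise it.
```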
## Best-Practices

We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save time and setup effort.

## Key-Takeaway

`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm).
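For reference, Karatsuba replaces the four limb products of schoolbook multiplication with three, at the cost of a few additions; for two-limb operands with limb base $B$:

$$(a_1 B + a_0)(b_1 B + b_0) = a_1 b_1 B^2 + \left[(a_1 + a_0)(b_1 + b_0) - a_1 b_1 - a_0 b_0\right] B + a_0 b_0$$

Only the three products $a_1 b_1$, $a_0 b_0$ and $(a_1 + a_0)(b_1 + b_0)$ are computed; applied recursively this lowers the multiplication cost from $O(n^2)$ to $O(n^{\log_2 3})$.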
## Concise Usage Explanation

Define the field to be used, e.g.:

```c++
#include "api/bn254.h"
```

```c++
using namespace bn254;
typedef scalar_t T;
```
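A host-side call then looks roughly like the following sketch (the `bn254_mul_cuda` entry point and `DefaultVecOpsConfig` are the ones used by `example.cu` below):

```c++
#include <cstdlib>
#include <iostream>
#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"

using namespace bn254;
typedef scalar_t T;

int main()
{
  const unsigned n = 1024;
  T* a = (T*)malloc(n * sizeof(T));
  T* b = (T*)malloc(n * sizeof(T));
  T* out = (T*)malloc(n * sizeof(T));
  T::rand_host_many(a, n);
  T::rand_host_many(b, n);

  // All flags default to false: host-resident data, normal representation, synchronous call.
  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
  cudaError_t err = bn254_mul_cuda(a, b, n, config, out); // element-wise out[i] = a[i] * b[i]
  if (err != cudaSuccess) std::cerr << cudaGetErrorString(err) << std::endl;
  free(a); free(b); free(out);
  return 0;
}
```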
## Running the example

- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example

1. Define the parameters for the example, such as the vector size
2. Generate random vectors on-host
3. Copy them on-device
4. Execute element-wise vector operations (mul, add, sub) on-device for every option combination
5. Copy results on-host

A minimal sketch of this flow follows.
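The sketch below condenses steps 2-5, assuming the includes and `typedef` from the snippets above (error checks omitted for brevity; `example.cu` below does the full checking):

```c++
const unsigned n = 1024;
T *host_a, *host_b, *host_out, *dev_a, *dev_b, *dev_out;
host_a = (T*)malloc(n * sizeof(T));
host_b = (T*)malloc(n * sizeof(T));
host_out = (T*)malloc(n * sizeof(T));
T::rand_host_many(host_a, n);                                         // step 2: random host data
T::rand_host_many(host_b, n);

cudaMalloc((void**)&dev_a, n * sizeof(T));
cudaMalloc((void**)&dev_b, n * sizeof(T));
cudaMalloc((void**)&dev_out, n * sizeof(T));
cudaMemcpy(dev_a, host_a, n * sizeof(T), cudaMemcpyHostToDevice);     // step 3: host -> device
cudaMemcpy(dev_b, host_b, n * sizeof(T), cudaMemcpyHostToDevice);

vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
config.is_a_on_device = config.is_b_on_device = config.is_result_on_device = true;
bn254_mul_cuda(dev_a, dev_b, n, config, dev_out);                     // step 4: on-device mul

cudaMemcpy(host_out, dev_out, n * sizeof(T), cudaMemcpyDeviceToHost); // step 5: device -> host
```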
15
examples/c++/mont_vec_ops/compile.debug.sh
Executable file
@@ -0,0 +1,15 @@
#!/bin/bash

# Exit immediately on error
set -e

mkdir -p build/example
mkdir -p build/icicle

# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254
cmake --build build/icicle -j

# Configure and build the example application
cmake -DCMAKE_BUILD_TYPE=Debug -S . -B build/example
cmake --build build/example
15
examples/c++/mont_vec_ops/compile.sh
Executable file
@@ -0,0 +1,15 @@
#!/bin/bash

# Exit immediately on error
set -e

mkdir -p build/example
mkdir -p build/icicle

# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle -j

# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
607
examples/c++/mont_vec_ops/example.cu
Normal file
@@ -0,0 +1,607 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <nvml.h>

#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"
#include "utils/mont.cuh"

using namespace vec_ops;
using namespace bn254;

typedef scalar_t T;

enum Op {
  MUL,
  ADD,
  SUB,
  LAST
};
// bn254 scalar field modulus = 21888242871839275222246405745257275088548364400416034343698204186575808495617

int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config, Op op)
{
  cudaError_t err = cudaSuccess;
  switch (op) {
  case MUL: err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
  case ADD: err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
  case SUB: err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
  default: break;
  }
  if (err != cudaSuccess) {
    std::cerr << "Failed to execute vector operation - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  return 0;
}
int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
  if (err != cudaSuccess) {
    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  return 0;
}

int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
  cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
  if (err != cudaSuccess) {
    std::cerr << "Failed to add vectors - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  return 0;
}

int vector_sub(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
  cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
  if (err != cudaSuccess) {
    std::cerr << "Failed to subtract vectors - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  return 0;
}
int main(int argc, char** argv)
{
  const unsigned vector_size = 1 << 0;
  const unsigned repetitions = 1 << 0;

  cudaError_t err;
  nvmlInit();
  nvmlDevice_t device;
  nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
  std::cout << "Icicle-Examples: vector mul / add / sub operations." << std::endl;
  char name[NVML_DEVICE_NAME_BUFFER_SIZE];
  if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
    std::cout << "GPU Model: " << name << std::endl;
  } else {
    std::cerr << "Failed to get GPU model name." << std::endl;
  }
  unsigned power_limit;
  nvmlDeviceGetPowerManagementLimit(device, &power_limit); // NVML reports milliwatts

  std::cout << "Vector size: " << vector_size << std::endl;
  std::cout << "Repetitions: " << repetitions << std::endl;
  std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;

  unsigned int baseline_power;
  nvmlDeviceGetPowerUsage(device, &baseline_power);
  std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
  unsigned baseline_temperature;
  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
    std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
  } else {
    std::cerr << "Failed to get GPU temperature." << std::endl;
  }
  // host data
  std::cout << "Allocate memory for the input vectors (both normal and Montgomery representation)" << std::endl;
  T* host_in1_init = (T*)malloc(vector_size * sizeof(T));
  T* host_in2_init = (T*)malloc(vector_size * sizeof(T));
  std::cout << "Initializing vectors with random data in normal representation" << std::endl;
  T::rand_host_many(host_in1_init, vector_size);
  T::rand_host_many(host_in2_init, vector_size);
  std::cout << "Allocate memory for the output vectors" << std::endl;
  T* host_out = (T*)malloc(vector_size * sizeof(T));         // Used for the test output.
  T* host_out_ref_mul = (T*)malloc(vector_size * sizeof(T)); // Reference result for mul (compared against host_out).
  T* host_out_ref_add = (T*)malloc(vector_size * sizeof(T)); // Reference result for add (compared against host_out).
  T* host_out_ref_sub = (T*)malloc(vector_size * sizeof(T)); // Reference result for sub (compared against host_out).
  std::cout << "Initializing output vectors with random data" << std::endl;
  T::rand_host_many(host_out, vector_size);
  T::rand_host_many(host_out_ref_mul, vector_size);
  T::rand_host_many(host_out_ref_add, vector_size);
  T::rand_host_many(host_out_ref_sub, vector_size);
  // device data
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  T* device_in1;
  T* device_in2;
  T* device_out;

  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
  //****************************************
  // Test warm-up and reference output configuration. The reference output is used to check whether a test passed.
  //****************************************
  // copy from host to device
  err = cudaMemcpy(device_in1, host_in1_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  err = cudaMemcpy(device_in2, host_in2_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  std::cout << "Starting warm-up run" << std::endl;
  // Warm-up loop
  for (int op = MUL; op != LAST; op++) {
    for (int i = 0; i < repetitions; i++) {
      vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
    }
    // Copy the result from device to the per-op reference buffer to keep it for later comparisons.
    switch (op) {
    case MUL: err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
    case ADD: err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
    case SUB: err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
    }
  }
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
  //****************************************
  // End of test warm-up and reference output configuration.
  //****************************************
std::cout << "Starting benchmarking" << std::endl;
|
||||
unsigned power_before;
|
||||
nvmlDeviceGetPowerUsage(device, &power_before);
|
||||
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
|
||||
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
|
||||
<< " %" << std::endl;
|
||||
unsigned temperature_before;
|
||||
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
|
||||
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
|
||||
} else {
|
||||
std::cerr << "Failed to get GPU temperature." << std::endl;
|
||||
}
|
||||
|
||||
  //*******************************************************
  // Benchmark test:
  // Loop over the operations (mul, add, sub):
  //   Loop over the configurations {is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form}:
  //*******************************************************
  T* host_in1 = (T*)malloc(vector_size * sizeof(T)); // Loaded from host_in1_init for each benchmark iteration.
  T* host_in2 = (T*)malloc(vector_size * sizeof(T)); // Loaded from host_in2_init for each benchmark iteration.
  // Test when the result is not in-place
  for (int op = MUL; op != LAST; op++) {
    for (int config_idx = 0; config_idx < 16; config_idx++) {
      std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
      for (int i = 0; i < vector_size; i++) {
        host_in1[i] = host_in1_init[i];
        host_in2[i] = host_in2_init[i];
      }
      config.is_a_on_device = (config_idx >> 3) & 0x1;
      config.is_b_on_device = (config_idx >> 2) & 0x1;
      config.is_result_on_device = (config_idx >> 1) & 0x1;
      config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
      // Copy from host to device again: device_in1/device_in2 were already overwritten by the warm-up.
      if (config.is_a_on_device) {
        if (config.is_in_montgomery_form) {
          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
        } else { // Normal representation.
          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      } else {
        if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
          err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      }
      if (config.is_b_on_device) {
        if (config.is_in_montgomery_form) {
          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
        } else { // Normal representation.
          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      } else {
        if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
          err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      }
      CHK_IF_RETURN(cudaPeekAtLastError());
      auto start_time = std::chrono::high_resolution_clock::now();
      // Benchmark loop
      for (int i = 0; i < repetitions; i++) {
        switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}
        case 0b000: vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break;
        case 0b001: vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break;
        case 0b010: vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break;
        case 0b011: vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break;
        case 0b100: vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break;
        case 0b101: vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break;
        case 0b110: vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break;
        case 0b111: vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break;
        }
        CHK_IF_RETURN(cudaPeekAtLastError());
      }

      auto end_time = std::chrono::high_resolution_clock::now();
      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
      switch (op) {
      case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result not in-place" << std::endl; break;
      case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result not in-place" << std::endl; break;
      case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result not in-place" << std::endl; break;
      }
      if (config.is_result_on_device) { // Copy the result to host_out in order to compare it with the reference.
        if (config.is_in_montgomery_form) { // Convert from Montgomery to normal representation first.
          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));
        }
        err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
          std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
          return 0;
        }
      } else { // The result is already in host_out.
        if (config.is_in_montgomery_form) { // host_out is copied to the device, converted from Montgomery and copied back, then compared with the reference.
          err = cudaMemcpy(device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));
          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        } else { // host_out can be compared with the reference as is.
        }
      }
      //****************************************
      // End of benchmark test.
      //****************************************

      //***********************************************
      // Test result check
      // The benchmark result (host_out, normal representation) is compared
      // against the reference output (host_out_ref_mul/add/sub) computed
      // during the warm-up run.
      //***********************************************
      int test_failed = 0;
      switch (op) {
      case MUL:
        for (int i = 0; i < vector_size; i++) {
          if (host_out_ref_mul[i] != host_out[i]) {
            std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
            test_failed = 1;
          }
        }
        break;
      case ADD:
        for (int i = 0; i < vector_size; i++) {
          if (host_out_ref_add[i] != host_out[i]) {
            std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
            test_failed = 1;
          }
        }
        break;
      case SUB:
        for (int i = 0; i < vector_size; i++) {
          if (host_out_ref_sub[i] != host_out[i]) {
            std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
            test_failed = 1;
          }
        }
        break;
      }
      if (test_failed) {
        std::cout << "===>>> result is not in-place: " << std::endl;
        std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
        std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
        std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
        std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
        exit(2);
      }
      unsigned power_after;
      nvmlDeviceGetPowerUsage(device, &power_after);
      std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
      std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
                << " %" << std::endl;
      unsigned temperature_after;
      if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
        std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
      } else {
        std::cerr << "Failed to get GPU temperature." << std::endl;
      }

      // Report performance in GOPS: Giga Operations Per Second (the loop also covers add and sub)
      double GOPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
      std::cout << "Performance: " << GOPS << " Giga Operations Per Second" << std::endl;
    }
  }
  // Test when the result is in-place
  for (int op = MUL; op != LAST; op++) {
    for (int config_idx = 0; config_idx < 16; config_idx++) {
      for (int i = 0; i < vector_size; i++) {
        host_in1[i] = host_in1_init[i];
        host_in2[i] = host_in2_init[i];
      }
      config.is_a_on_device = (config_idx >> 3) & 0x1;
      config.is_b_on_device = (config_idx >> 2) & 0x1;
      config.is_result_on_device = (config_idx >> 1) & 0x1;
      config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
      // In-place operation requires a and the result to live in the same memory space.
      if ((config.is_a_on_device ^ config.is_result_on_device) == 1) {
        continue;
      }
      // Copy from host to device again: device_in1/device_in2 were already overwritten by the warm-up.
      if (config.is_a_on_device) {
        if (config.is_in_montgomery_form) {
          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
        } else { // Normal representation.
          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      } else {
        if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
          err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      }
      if (config.is_b_on_device) {
        if (config.is_in_montgomery_form) {
          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
        } else { // Normal representation.
          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      } else {
        if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
          err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      }
      CHK_IF_RETURN(cudaPeekAtLastError());
      auto start_time = std::chrono::high_resolution_clock::now();
      // Benchmark loop
      for (int i = 0; i < repetitions; i++) {
        switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}; a and the result share a location.
        case 0b000: vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); break;
        case 0b001: break; // skipped above: a on host, result on device
        case 0b010: vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); break;
        case 0b011: break; // skipped above: a on host, result on device
        case 0b100: break; // skipped above: a on device, result on host
        case 0b101: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); break;
        case 0b110: break; // skipped above: a on device, result on host
        case 0b111: vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break;
        }
        CHK_IF_RETURN(cudaPeekAtLastError());
      }

      auto end_time = std::chrono::high_resolution_clock::now();
      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
      switch (op) {
      case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result in-place" << std::endl; break;
      case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result in-place" << std::endl; break;
      case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result in-place" << std::endl; break;
      }
      if (config.is_result_on_device) { // The result is in device_in1; copy it to host_out to compare with the reference.
        if (config.is_in_montgomery_form) { // Convert from Montgomery to normal representation first.
          CHK_IF_RETURN(mont::from_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
        }
        err = cudaMemcpy(host_out, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
          std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl;
          return 0;
        }
      } else { // The result is in host_in1; it has to be moved to host_out for the pass/fail check.
        if (config.is_in_montgomery_form) { // host_in1 is copied to the device, converted from Montgomery and copied back to host_out, then compared with the reference.
          err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));
          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        } else { // Normal representation: stage host_in1 through device_out into host_out.
          err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
          if (err != cudaSuccess) {
            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
            return 0;
          }
        }
      }
      //****************************************
      // End of benchmark test.
      //****************************************

      //***********************************************
      // Test result check
      // The benchmark result (host_out, normal representation) is compared
      // against the reference output (host_out_ref_mul/add/sub) computed
      // during the warm-up run.
      //***********************************************
      int test_failed = 0;
      switch (op) {
      case MUL:
        for (int i = 0; i < vector_size; i++) {
          if (host_out_ref_mul[i] != host_out[i]) {
            std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
            std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
            test_failed = 1;
          }
        }
        break;
      case ADD:
        for (int i = 0; i < vector_size; i++) {
          if (host_out_ref_add[i] != host_out[i]) {
            std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
            std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
            test_failed = 1;
          }
        }
        break;
      case SUB:
        for (int i = 0; i < vector_size; i++) {
          if (host_out_ref_sub[i] != host_out[i]) {
            std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
            std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
            test_failed = 1;
          }
        }
        break;
      }
      if (test_failed) {
        std::cout << "===>>> result is in-place: " << std::endl;
        std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
        std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
        std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
        std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
        std::cout << "host_out[0] = " << host_out[0] << std::endl;
        exit(2);
      }
      unsigned power_after;
      nvmlDeviceGetPowerUsage(device, &power_after);
      std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
      std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
                << " %" << std::endl;
      unsigned temperature_after;
      if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
        std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
      } else {
        std::cerr << "Failed to get GPU temperature." << std::endl;
      }

      // Report performance in GOPS: Giga Operations Per Second (the loop also covers add and sub)
      double GOPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
      std::cout << "Performance: " << GOPS << " Giga Operations Per Second" << std::endl;
    }
  }
  // clean up and exit
  free(host_in1_init);
  free(host_in2_init);
  free(host_in1);
  free(host_in2);
  free(host_out);
  free(host_out_ref_mul);
  free(host_out_ref_add);
  free(host_out_ref_sub);
  cudaFree(device_in1);
  cudaFree(device_in2);
  cudaFree(device_out);
  nvmlShutdown();
  return 0;
}
2
examples/c++/mont_vec_ops/run.sh
Executable file
@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example
@@ -27,6 +27,7 @@ namespace vec_ops {
    * non-blocking and you'd need to synchronize it explicitly by running
    * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
    * function will block the current CPU thread. */
+  bool is_in_montgomery_form; /**< If true then vec_a, vec_b and the result are in Montgomery form. Default value: false. */
 };

 /**
@@ -42,6 +43,7 @@ namespace vec_ops {
     false, // is_b_on_device
     false, // is_result_on_device
     false, // is_async
+    false, // is_in_montgomery_form
   };
   return config;
 }
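As a usage note for the two flags documented above, a minimal sketch (hypothetical device buffers `d_a`, `d_b`, `d_out` of length `n`; `bn254_mul_cuda` as used in the example above):

```c++
vec_ops::VecOpsConfig cfg = vec_ops::DefaultVecOpsConfig();
cfg.is_a_on_device = cfg.is_b_on_device = cfg.is_result_on_device = true;
cfg.is_in_montgomery_form = true;        // vec_a, vec_b and the result are Montgomery-form values
cfg.is_async = true;                     // the call only enqueues work on cfg.ctx.stream
bn254_mul_cuda(d_a, d_b, n, cfg, d_out); // returns before the kernel finishes
cudaStreamSynchronize(cfg.ctx.stream);   // explicit synchronization is required when is_async is true
```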
@@ -95,25 +95,62 @@ namespace vec_ops {
   E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
   E* d_vec_a;
   const E* d_vec_b;

+  int is_d_alloc_vec_a_allocated = 0;
   if (!config.is_a_on_device) {
-    CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
-    CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-    d_vec_a = d_alloc_vec_a;
+    if (config.is_in_montgomery_form) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+      CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n, config.ctx.stream, d_alloc_vec_a));
+      is_d_alloc_vec_a_allocated = 1;
+      d_vec_a = d_alloc_vec_a;
+    } else {
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+      is_d_alloc_vec_a_allocated = 1;
+      d_vec_a = d_alloc_vec_a;
+    }
   } else {
-    d_vec_a = vec_a;
+    if (config.is_in_montgomery_form) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
+      CHK_IF_RETURN(mont::from_montgomery(vec_a, n, config.ctx.stream, d_alloc_vec_a));
+      is_d_alloc_vec_a_allocated = 1;
+      d_vec_a = d_alloc_vec_a;
+    } else {
+      d_vec_a = vec_a;
+    }
   }

+  int is_d_alloc_vec_b_allocated = 0;
   if (!config.is_b_on_device) {
-    CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
-    CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-    d_vec_b = d_alloc_vec_b;
+    if (config.is_in_montgomery_form) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+      CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n, config.ctx.stream, d_alloc_vec_b));
+      is_d_alloc_vec_b_allocated = 1;
+      d_vec_b = d_alloc_vec_b;
+    } else {
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+      is_d_alloc_vec_b_allocated = 1;
+      d_vec_b = d_alloc_vec_b;
+    }
   } else {
-    d_vec_b = vec_b;
+    if (config.is_in_montgomery_form) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
+      CHK_IF_RETURN(mont::from_montgomery(vec_b, n, config.ctx.stream, d_alloc_vec_b));
+      is_d_alloc_vec_b_allocated = 1;
+      d_vec_b = d_alloc_vec_b;
+    } else {
+      d_vec_b = vec_b;
+    }
   }

+  int is_d_result_allocated = 0;
   if (!config.is_result_on_device) {
     if (!is_in_place) {
       CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+      is_d_result_allocated = 1;
     } else {
       d_result = d_vec_a;
     }
@@ -129,12 +166,21 @@ namespace vec_ops {
   Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);

   if (!config.is_result_on_device) {
-    CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
-    CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
+    if (config.is_in_montgomery_form) {
+      CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
+      CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
+    } else {
+      CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
+    }
+  } else {
+    if (config.is_in_montgomery_form) {
+      CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
+    }
   }

-  if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
-  if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
+  if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
+  if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
+  if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); }

   if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));