Add montgomery to vec_ops and example of that

This commit is contained in:
danny-shterman
2024-07-29 06:39:49 +00:00
parent 53f34aade5
commit 7e2b42f756
8 changed files with 766 additions and 12 deletions

View File

@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -0,0 +1,42 @@
# Icicle example: Montgomery vector operations (mul, add, sub) for all possible options:
- is_a_on_device
- is_b_on_device
- is_result_on_device
- is_in_montgomery_form

(is_async isn't checked)
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save time and mental energy.
## Key-Takeaway
`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm).
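For intuition, here is a minimal single-step Karatsuba sketch on 64-bit operands. It is an illustration only, not Icicle's GPU implementation (which works on multi-limb field elements); the `karatsuba64` helper and the use of the GCC/Clang extension `__uint128_t` are ours:
```c++
#include <cstdint>

// One Karatsuba step: compute a*b with three half-width multiplications instead of four.
__uint128_t karatsuba64(uint64_t a, uint64_t b)
{
  uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
  uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
  __uint128_t lo  = (__uint128_t)a_lo * b_lo;                            // low halves
  __uint128_t hi  = (__uint128_t)a_hi * b_hi;                            // high halves
  __uint128_t mid = (__uint128_t)(a_lo + a_hi) * (b_lo + b_hi) - lo - hi; // cross terms
  return (hi << 64) + (mid << 32) + lo;
}
```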
## Concise Usage Explanation
Define the field to be used, e.g.:
```c++
#include "api/bn254.h"
```
```c++
using namespace bn254;
typedef scalar_t T;
```
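A minimal usage sketch under the same setup (the `bn254_mul_cuda` call, `VecOpsConfig`, and `rand_host_many` come from this example; the vector size here is arbitrary):
```c++
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
// config.is_in_montgomery_form = true; // enable when a, b (and the result) are in Montgomery form
T a[4], b[4], result[4];
T::rand_host_many(a, 4); // random field elements on the host
T::rand_host_many(b, 4);
cudaError_t err = bn254_mul_cuda(a, b, 4, config, result); // element-wise a[i] * b[i]
```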
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the parameters for the example, such as the vector size
2. Generate random vectors on-host
3. Copy them on-device
4. Execute element-wise vector multiplication, addition, and subtraction on-device for every combination of the options above (Montgomery conversions are sketched below)
5. Copy results on-host and compare them against reference results computed in normal representation
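When a Montgomery-form configuration is selected, the example converts vectors on-device with `mont::to_montgomery` / `mont::from_montgomery` (a sketch; the device pointers and an initialized `config` are assumed):
```c++
// Convert the inputs to Montgomery form in-place before the vector op...
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
// ...and convert the result back to normal representation afterwards.
CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));
```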

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254
cmake --build build/icicle -j
# Configure and build the example application
cmake -DCMAKE_BUILD_TYPE=Debug -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle -j
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,607 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <nvml.h>
#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"
#include <vec_ops/../../include/utils/mont.cuh>
using namespace vec_ops;
using namespace bn254;
typedef scalar_t T;
enum Op {
MUL,
ADD,
SUB,
LAST
};
// bn254 scalar field modulus = 21888242871839275222246405745257275088548364400416034343698204186575808495617
int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config, Op op)
{
cudaError_t err = cudaSuccess;
switch (op) {
case MUL: err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
case ADD: err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
case SUB: err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
default: break;
}
if (err != cudaSuccess) {
std::cerr << "Failed to execute vector operation - " << cudaGetErrorString(err) << std::endl;
return -1;
}
return 0;
}
int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int vector_sub(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
{
cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 0; // kept tiny for a quick functional run; increase (e.g. 1 << 20) for meaningful benchmarking
const unsigned repetitions = 1 << 0;
cudaError_t err;
nvmlInit();
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
std::cout << "Icicle-Examples: vector mul / add / sub operations." << std::endl;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
unsigned power_limit;
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
std::cout << "Vector size: " << vector_size << std::endl;
std::cout << "Repetitions: " << repetitions << std::endl;
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
unsigned int baseline_power;
nvmlDeviceGetPowerUsage(device, &baseline_power);
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
unsigned baseline_temperature;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// host data
std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl;
T* host_in1_init = (T*)malloc(vector_size * sizeof(T));
T* host_in2_init = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with normal presentation random data" << std::endl;
T::rand_host_many(host_in1_init, vector_size);
T::rand_host_many(host_in2_init, vector_size);
std::cout << "Allocate memory for the output vectors" << std::endl;
T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output.
T* host_out_ref_mul = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content).
T* host_out_ref_add = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content).
T* host_out_ref_sub = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content).
std::cout << "Initializing output vectors with random data" << std::endl;
T::rand_host_many(host_out, vector_size);
T::rand_host_many(host_out_ref_mul, vector_size);
T::rand_host_many(host_out_ref_add, vector_size);
T::rand_host_many(host_out_ref_sub, vector_size);
// device data
device_context::DeviceContext ctx = device_context::get_default_device_context();
T* device_in1;
T* device_in2;
T* device_out;
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
//****************************************
// Test warm-up and reference output computation. The reference output is later used to check whether the test passed.
//****************************************
// copy from host to device
err = cudaMemcpy(device_in1, host_in1_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2, host_in2_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up run" << std::endl;
// Warm-up loop
for ( int op = MUL; op != LAST; op++ ) {
for (int i = 0; i < repetitions; i++) {
vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
}
switch (op) {
case MUL: err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
case ADD: err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
case SUB: err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
}
}
// The reference results were copied from device to host in the switch above; err holds the status of that copy.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
return 0;
}
//****************************************
// End of test warm-up and reference output computation.
//****************************************
std::cout << "Starting benchmarking" << std::endl;
unsigned power_before;
nvmlDeviceGetPowerUsage(device, &power_before);
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
<< " %" << std::endl;
unsigned temperature_before;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
//*******************************************************
// Benchmark test:
// Loop for (mul, add, sub):
// Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form):
//*******************************************************
T* host_in1 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark.
T* host_in2 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark.
// Test when the result is not in-place
for ( int op = MUL; op != LAST; op++ ) {
for (int config_idx = 0; config_idx < 16; config_idx++) {
std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
for (int i=0; i<vector_size; i++) {
host_in1[i] = host_in1_init[i];
host_in2[i] = host_in2_init[i];
}
config.is_a_on_device = (config_idx >> 3) & 0x1;
config.is_b_on_device = (config_idx >> 2) & 0x1;
config.is_result_on_device = (config_idx >> 1) & 0x1;
config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
// Copy inputs from host to device again; device_inX was already overwritten by the warm-up run.
if (config.is_a_on_device) {
if (config.is_in_montgomery_form) {
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
} else { // Normal representation.
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
if (config.is_b_on_device) {
if (config.is_in_montgomery_form) {
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
} else {
// Normal representation.
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
CHK_IF_RETURN(cudaPeekAtLastError());
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}
case 0b000: vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break;
case 0b001: vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break;
case 0b010: vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break;
case 0b011: vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break;
case 0b100: vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break;
case 0b101: vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break;
case 0b110: vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break;
case 0b111: vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break;
}
CHK_IF_RETURN(cudaPeekAtLastError());
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
switch (op) {
case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result not in-place" << std::endl; break;
case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result not in-place" << std::endl; break;
case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result not in-place" << std::endl; break;
}
if (config.is_result_on_device) { // Copy the result to host_out in order to compare it against the reference result.
if (config.is_in_montgomery_form) { // Convert from Montgomery to normal form if needed.
CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal form for the comparison against the reference.
}
err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out for the comparison against the reference.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // The result is not on the device; it is already in host_out.
if (config.is_in_montgomery_form) { // host_out has to be copied to the device, converted from Montgomery to normal form, and copied back before comparing against the reference.
err = cudaMemcpy(device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy host_out to the device for the conversion.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal form for the comparison against the reference.
err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy back to host_out for the comparison against the reference.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // host_out can be compared against the reference as-is.
}
}
//****************************************
// End of benchmark test.
//****************************************
//***********************************************
// Test result check
// The benchmark result (host_out) is compared against the reference
// result computed in normal representation during the warm-up run
// (host_out_ref_mul / host_out_ref_add / host_out_ref_sub).
//***********************************************
int test_failed = 0;
switch (op) {
case MUL: for (int i=0; i<vector_size; i++) {
if (host_out_ref_mul[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
test_failed = 1;
}
}
break;
case ADD: for (int i=0; i<vector_size; i++) {
if (host_out_ref_add[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
test_failed = 1;
}
}
break;
case SUB: for (int i=0; i<vector_size; i++) {
if (host_out_ref_sub[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
test_failed = 1;
}
}
break;
}
if (test_failed) {
// std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl;
std::cout << "===>>> result is not in-place: " << std::endl;
std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
exit(2);
}
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
<< " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in giga operations per second (the loop also covers add and sub, not only mul)
double gops = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << gops << " giga operations per second" << std::endl;
}
}
// Test when the result is in-place
for ( int op = MUL; op != LAST; op++ ) {
for (int config_idx = 0; config_idx < 16; config_idx++) {
for (int i=0; i<vector_size; i++) {
host_in1[i] = host_in1_init[i];
host_in2[i] = host_in2_init[i];
}
config.is_a_on_device = (config_idx >> 3) & 0x1;
config.is_b_on_device = (config_idx >> 2) & 0x1;
config.is_result_on_device = (config_idx >> 1) & 0x1;
config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
if (config.is_a_on_device != config.is_result_on_device) { // In-place requires a and the result on the same side (host or device).
continue;
}
// Copy inputs from host to device again; device_inX was already overwritten by the warm-up run.
if (config.is_a_on_device) {
if (config.is_in_montgomery_form) {
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
} else { // Normal representation.
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
if (config.is_b_on_device) {
if (config.is_in_montgomery_form) {
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
} else {
// Normal representation.
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
} else {
if (config.is_in_montgomery_form) { // Copy to device, convert to Montgomery and copy back to host.
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
CHK_IF_RETURN(cudaPeekAtLastError());
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}
case 0b000: vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); break;
case 0b010: vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); break;
case 0b101: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); break;
case 0b111: vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break;
default: break; // The remaining combinations are skipped above (in-place requires a and the result on the same side).
}
CHK_IF_RETURN(cudaPeekAtLastError());
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
switch (op) {
case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result in-place" << std::endl; break;
case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result in-place" << std::endl; break;
case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result in-place" << std::endl; break;
}
if (config.is_result_on_device) { // The in-place result is in device_in1; copy it to host_out for the comparison against the reference.
if (config.is_in_montgomery_form) { // Convert from Montgomery to normal form if needed.
CHK_IF_RETURN(mont::from_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert to normal form for the comparison against the reference.
}
err = cudaMemcpy(host_out, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out for the comparison against the reference.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // The in-place result is in host_in1; it has to be moved to host_out for the pass/fail check.
if (config.is_in_montgomery_form) { // host_in1 has to be copied to the device, converted from Montgomery to normal form, and copied into host_out before comparing against the reference.
err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy host_in1 to the device for the conversion.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal form for the comparison against the reference.
err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy back to host_out for the comparison against the reference.
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
} else { // Normal representation: host_in1 only has to reach host_out; it is routed through the device to keep the flow uniform.
err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
return 0;
}
}
}
//****************************************
// End of benchmark test.
//****************************************
//***********************************************
// Test result check
// The benchmark result (host_out) is compared against the reference
// result computed in normal representation during the warm-up run
// (host_out_ref_mul / host_out_ref_add / host_out_ref_sub).
//***********************************************
int test_failed = 0;
switch (op) {
case MUL: for (int i=0; i<vector_size; i++) {
if (host_out_ref_mul[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
test_failed = 1;
}
}
break;
case ADD: for (int i=0; i<vector_size; i++) {
if (host_out_ref_add[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
test_failed = 1;
}
}
break;
case SUB: for (int i=0; i<vector_size; i++) {
if (host_out_ref_sub[i] != host_out[i]) {
std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
test_failed = 1;
}
}
break;
}
if (test_failed) {
// std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl;
std::cout << "===>>> result is in-place: " << std::endl;
std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
std::cout << "host_out[0] = " << host_out[0] << std::endl;
exit(2);
}
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
<< " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in giga operations per second (the loop also covers add and sub, not only mul)
double gops = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << gops << " giga operations per second" << std::endl;
}
}
// clean up and exit
free(host_in1_init);
free(host_in2_init);
free(host_in1);
free(host_in2);
free(host_out);
free(host_out_ref_mul);
free(host_out_ref_add);
free(host_out_ref_sub);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
nvmlShutdown();
return 0;
}

View File

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example

View File

@@ -27,6 +27,7 @@ namespace vec_ops {
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
* function will block the current CPU thread. */
bool is_in_montgomery_form; /**< If true, vec_a, vec_b, and the result are in Montgomery form. Default value: false. */
};
/**
@@ -42,6 +43,7 @@ namespace vec_ops {
false, // is_b_on_device
false, // is_result_on_device
false, // is_async
false, // is_in_montgomery_form
};
return config;
}

View File

@@ -95,25 +95,62 @@ namespace vec_ops {
E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
E* d_vec_a;
const E* d_vec_b;
int is_d_alloc_vec_a_allocated = 0;
if (!config.is_a_on_device) {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n, config.ctx.stream, d_alloc_vec_a));
is_d_alloc_vec_a_allocated = 1;
d_vec_a = d_alloc_vec_a;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
is_d_alloc_vec_a_allocated = 1;
d_vec_a = d_alloc_vec_a;
}
} else {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
CHK_IF_RETURN(mont::from_montgomery(vec_a, n, config.ctx.stream, d_alloc_vec_a));
is_d_alloc_vec_a_allocated = 1;
d_vec_a = d_alloc_vec_a;
} else {
d_vec_a = vec_a;
}
}
int is_d_alloc_vec_b_allocated = 0;
if (!config.is_b_on_device) {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n, config.ctx.stream, d_alloc_vec_b));
is_d_alloc_vec_b_allocated = 1;
d_vec_b = d_alloc_vec_b;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
is_d_alloc_vec_b_allocated = 1;
d_vec_b = d_alloc_vec_b;
}
} else {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
CHK_IF_RETURN(mont::from_montgomery(vec_b, n, config.ctx.stream, d_alloc_vec_b));
is_d_alloc_vec_b_allocated = 1;
d_vec_b = d_alloc_vec_b;
} else {
d_vec_b = vec_b;
}
}
int is_d_result_allocated = 0;
if (!config.is_result_on_device) {
if (!is_in_place) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
is_d_result_allocated = 1;
} else {
d_result = d_vec_a;
}
@@ -129,12 +166,21 @@ namespace vec_ops {
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
if (!config.is_result_on_device) {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
} else {
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
}
} else {
if (config.is_in_montgomery_form) {
CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
}
}
if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); }
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));