mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-09 03:55:04 -05:00)
feat(cuda): implement lwe ciphertext / plaintext add in concrete-cuda
@@ -24,3 +24,25 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
                input_lwe_ciphertext_count);
}
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_addition_plaintext(v_stream, gpu_index,
                          static_cast<uint32_t *>(lwe_array_out),
                          static_cast<uint32_t *>(lwe_array_in),
                          static_cast<uint32_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_addition_plaintext(v_stream, gpu_index,
                          static_cast<uint64_t *>(lwe_array_out),
                          static_cast<uint64_t *>(lwe_array_in),
                          static_cast<uint64_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}
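Both entry points are thin wrappers that differ only in the integer width of their static_casts before dispatching to the shared host_addition_plaintext template. As an illustration of how they would be driven, here is a minimal host-side caller. It is a sketch, not part of the commit: it assumes the function is exposed with C linkage, uses placeholder data instead of real ciphertexts, and relies only on the signature shown above. Note that v_stream is later dereferenced as a cudaStream_t *, so the caller passes the address of the stream.

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

// Declaration copied from the diff; C linkage is an assumption here.
extern "C" void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_array_in, void *plaintext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);

int main() {
  const uint32_t lwe_dimension = 600, count = 100;
  const size_t ct_words = size_t(lwe_dimension + 1) * count;

  // Placeholder data; real inputs would be LWE ciphertexts produced by an
  // encryption step elsewhere.
  std::vector<uint64_t> h_in(ct_words, 0), h_out(ct_words), h_pt(count, 42);

  uint64_t *d_in, *d_out, *d_pt;
  cudaMalloc(&d_in, ct_words * sizeof(uint64_t));
  cudaMalloc(&d_out, ct_words * sizeof(uint64_t));
  cudaMalloc(&d_pt, count * sizeof(uint64_t));
  cudaMemcpy(d_in, h_in.data(), ct_words * sizeof(uint64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_pt, h_pt.data(), count * sizeof(uint64_t),
             cudaMemcpyHostToDevice);

  // host_addition_plaintext dereferences v_stream as a cudaStream_t *,
  // so the address of the stream is what gets passed.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
      &stream, /*gpu_index=*/0, d_out, d_in, d_pt, lwe_dimension, count);

  // The host function already synchronizes the stream before returning.
  cudaMemcpy(h_out.data(), d_out, ct_words * sizeof(uint64_t),
             cudaMemcpyDeviceToHost);

  cudaStreamDestroy(stream);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_pt);
  return 0;
}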
@@ -7,21 +7,38 @@
#include <helper_cuda.h>
#endif

#include "../include/helper_cuda.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>

template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
                         uint32_t num_entries) {

  int tid = threadIdx.x;
-  if (tid < num_entries) {
-    int index = blockIdx.x * blockDim.x + tid;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = input_1[index] + input_2[index];
  }
}

template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
                                   uint32_t input_lwe_dimension,
                                   uint32_t num_entries) {

  int tid = threadIdx.x;
  int plaintext_index = blockIdx.x * blockDim.x + tid;
  if (plaintext_index < num_entries) {
    int index =
        plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = lwe_input[index] + plaintext_input[plaintext_index];
  }
}

template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
                            T *input_1, T *input_2,
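The index arithmetic in plaintext_addition fixes the memory layout: ciphertext i occupies the input_lwe_dimension + 1 consecutive words starting at i * (input_lwe_dimension + 1), mask first and body last, and adding a plaintext only modifies the body word because the mask is independent of the message. The unsigned wrap-around noted in the comment is exactly the reduction modulo 2^32 or 2^64. For reference, a CPU-side equivalent under that same layout assumption (illustrative, not code from the repository):

#include <cstdint>
#include <cstring>

// Illustrative reference, not from the repository. n is the LWE dimension;
// ciphertext i is the slice [i * (n + 1), i * (n + 1) + n], mask words
// first, body word last. Only the body receives the plaintext; uint64_t
// wrap-around performs the reduction mod 2^64.
void plaintext_addition_cpu(uint64_t *output, const uint64_t *lwe_input,
                            const uint64_t *plaintext_input, uint32_t n,
                            uint32_t count) {
  std::memcpy(output, lwe_input, size_t(n + 1) * count * sizeof(uint64_t));
  for (uint32_t i = 0; i < count; ++i)
    output[size_t(i) * (n + 1) + n] += plaintext_input[i];
}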
@@ -45,4 +62,29 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
  cudaStreamSynchronize(*stream);
}

template <typename T>
__host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
                                      T *output, T *lwe_input,
                                      T *plaintext_input,
                                      uint32_t input_lwe_dimension,
                                      uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  auto stream = static_cast<cudaStream_t *>(v_stream);

  checkCudaErrors(cudaMemcpyAsync(output, lwe_input,
                                  (input_lwe_dimension + 1) *
                                      input_lwe_ciphertext_count * sizeof(T),
                                  cudaMemcpyDeviceToDevice, *stream));
  plaintext_addition<<<grid, thds, 0, *stream>>>(
      output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);

  cudaStreamSynchronize(*stream);
}
#endif // CUDA_ADD_H
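getNumBlocksAndThreads is pulled in from utils/kernel_dimensions.cuh and does not appear in this diff. Given that host_addition_plaintext launches one thread per ciphertext and lets the kernel's own bounds check absorb any excess threads, the helper plausibly amounts to a capped ceiling division, along the lines of the following sketch (an assumption about its behaviour, not its actual implementation):

// Assumed shape of the helper, for illustration only; the real
// getNumBlocksAndThreads lives in utils/kernel_dimensions.cuh.
void get_launch_dims(int num_entries, int max_threads, int &num_blocks,
                     int &num_threads) {
  // Cap the block size, then take a ceiling division so that
  // num_blocks * num_threads >= num_entries and the kernel's bounds
  // check is the only guard needed.
  num_threads = num_entries < max_threads ? num_entries : max_threads;
  num_blocks = (num_entries + num_threads - 1) / num_threads;
}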
@@ -14,8 +14,8 @@ template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {

  int tid = threadIdx.x;
-  if (tid < num_entries) {
-    int index = blockIdx.x * blockDim.x + tid;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = -input[index];
  }
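negation receives the same one-line fix as addition above: the old guard compared tid, the thread's index within its block, against num_entries, so every thread in every block past the first could pass the check while its global index ran off the end of the buffers. A small host-side enumeration (hypothetical, not in the commit) makes the failure mode concrete:

#include <cstdio>

int main() {
  // 600 entries launched over 2 blocks of 512 threads.
  int block_dim = 512, num_blocks = 2, num_entries = 600, oob = 0;
  for (int b = 0; b < num_blocks; ++b)
    for (int tid = 0; tid < block_dim; ++tid) {
      int index = b * block_dim + tid;
      // Old guard: every tid passes (tid <= 511 < 600), even when the
      // global index runs past the end of the array.
      if (tid < num_entries && index >= num_entries)
        ++oob;
    }
  std::printf("out-of-bounds accesses under the old guard: %d\n", oob); // 424
}

Guarding on the global index instead bounds-checks every launched thread, which is the standard CUDA idiom.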