fix(concrete_cuda): fix buffer size type in the scratches, int is not big enough

Agnes Leroy
2023-03-21 10:08:11 +01:00
committed by Agnès Leroy
parent 6556bee101
commit 573f4756fd
6 changed files with 111 additions and 107 deletions
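
Why `int` overflows here — a minimal standalone sketch, not part of the change and using made-up parameter values: once a scratch buffer's byte count passes INT_MAX (about 2.1 GB), a 32-bit result truncates or goes negative before it ever reaches cuda_malloc_async, and a multiplication carried out entirely on 32-bit operands wraps even if the destination is already uint64_t.

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative values only (not taken from the commit's parameter sets).
  uint32_t polynomial_size = 8192;
  uint32_t glwe_dimension = 1;
  uint32_t input_lwe_ciphertext_count = 20000;

  // sizeof() is size_t, so this product is already evaluated in 64 bits.
  uint64_t per_input =
      sizeof(uint64_t) * polynomial_size * (glwe_dimension + 1) * 3;

  int narrow = (int)(per_input * input_lwe_ciphertext_count); // ~7.9 GB truncated into 32 bits
  uint64_t wide = per_input * input_lwe_ciphertext_count;     // real byte count
  printf("int: %d  uint64_t: %llu\n", narrow, (unsigned long long)wide);

  // Widening only the destination is not enough when every operand is 32-bit:
  // the multiply itself must happen in 64 bits, e.g. by casting one operand.
  uint64_t wrapped = polynomial_size * input_lwe_ciphertext_count * 64u;
  uint64_t exact = (uint64_t)polynomial_size * input_lwe_ciphertext_count * 64u;
  printf("wrapped: %llu  exact: %llu\n", (unsigned long long)wrapped,
         (unsigned long long)exact);
  return 0;
}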

View File

@@ -130,12 +130,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value,
 }
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_extract_bits(uint32_t glwe_dimension, uint32_t lwe_dimension,
-                             uint32_t polynomial_size,
-                             uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_extract_bits(
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t number_of_inputs) {
-  int buffer_size =
+  uint64_t buffer_size =
       sizeof(Torus) * number_of_inputs // lut_vector_indexes
       + ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
       + (glwe_dimension * polynomial_size + 1) *
@@ -159,7 +158,7 @@ scratch_extract_bits(void *v_stream, uint32_t gpu_index,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int buffer_size =
+  uint64_t buffer_size =
       get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
                                           polynomial_size, number_of_inputs) +
       get_buffer_size_bootstrap_low_latency<Torus>(

View File

@@ -212,9 +212,8 @@ __global__ void device_bootstrap_amortized(
 }
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
-                                            uint32_t glwe_dimension) {
+__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
+    uint32_t polynomial_size, uint32_t glwe_dimension) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
          sizeof(Torus) * polynomial_size *
              (glwe_dimension + 1) + // accumulator rotated
@@ -224,23 +223,23 @@ get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
 }
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 template <typename Torus>
-__host__ __device__ int get_buffer_size_bootstrap_amortized(
+__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
-  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
+  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
       polynomial_size, glwe_dimension);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
-  int partial_dm = full_sm - partial_sm;
-  int full_dm = full_sm;
-  int device_mem = 0;
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t full_dm = full_sm;
+  uint64_t device_mem = 0;
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count;
   } else if (max_shared_memory < full_sm) {
@@ -260,9 +259,9 @@ __host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
+  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
       polynomial_size, glwe_dimension);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
@@ -279,7 +278,7 @@ __host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
                              cudaFuncCachePreferShared));
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
+    uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
         max_shared_memory);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
@@ -297,15 +296,15 @@ __host__ void host_bootstrap_amortized(
     uint32_t lwe_idx, uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
-  int SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
+  uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
       polynomial_size, glwe_dimension);
-  int SM_PART =
+  uint64_t SM_PART =
       get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
-  int DM_PART = SM_FULL - SM_PART;
+  uint64_t DM_PART = SM_FULL - SM_PART;
-  int DM_FULL = SM_FULL;
+  uint64_t DM_FULL = SM_FULL;
   auto stream = static_cast<cudaStream_t *>(v_stream);

View File

@@ -135,7 +135,7 @@ __global__ void device_bootstrap_low_latency(
     Torus *lwe_array_in, double2 *bootstrapping_key, double2 *join_buffer,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, int8_t *device_mem,
-    int device_memory_size_per_block) {
+    uint64_t device_memory_size_per_block) {
   grid_group grid = this_grid();
@@ -144,14 +144,15 @@ __global__ void device_bootstrap_low_latency(
   // much faster than global memory
   extern __shared__ int8_t sharedmem[];
   int8_t *selected_memory;
-  int block_index =
-      blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
   uint32_t glwe_dimension = gridDim.y - 1;
-  if constexpr (SMD == FULLSM)
+  if constexpr (SMD == FULLSM) {
     selected_memory = sharedmem;
-  else
+  } else {
+    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
+                      blockIdx.z * gridDim.x * gridDim.y;
     selected_memory = &device_mem[block_index * device_memory_size_per_block];
+  }
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
@@ -245,7 +246,7 @@ __global__ void device_bootstrap_low_latency(
 }
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size + // accumulator_rotated
          sizeof(Torus) * polynomial_size + // accumulator
@@ -253,23 +254,23 @@ get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
 }
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }
 template <typename Torus>
-__host__ __device__ int get_buffer_size_bootstrap_low_latency(
+__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_dm = full_sm - partial_sm;
-  int full_dm = full_sm;
-  int device_mem = 0;
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t full_dm = full_sm;
+  uint64_t device_mem = 0;
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
@@ -277,9 +278,9 @@ __host__ __device__ int get_buffer_size_bootstrap_low_latency(
     device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
   }
-  int buffer_size = device_mem + (glwe_dimension + 1) * level_count *
-                                     input_lwe_ciphertext_count *
-                                     polynomial_size / 2 * sizeof(double2);
+  uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
+                                          input_lwe_ciphertext_count *
+                                          polynomial_size / 2 * sizeof(double2);
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -292,9 +293,9 @@ __host__ void scratch_bootstrap_low_latency(
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     check_cuda_error(cudaFuncSetAttribute(
@@ -313,7 +314,7 @@ __host__ void scratch_bootstrap_low_latency(
     check_cuda_error(cudaGetLastError());
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
+    uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
         glwe_dimension, polynomial_size, level_count,
         input_lwe_ciphertext_count, max_shared_memory);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
@@ -339,15 +340,15 @@ __host__ void host_bootstrap_low_latency(
   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int full_dm = full_sm;
+  uint64_t full_dm = full_sm;
-  int partial_dm = full_dm - partial_sm;
+  uint64_t partial_dm = full_dm - partial_sm;
   int8_t *d_mem = pbs_buffer;
   double2 *buffer_fft =

View File

@@ -101,21 +101,23 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
 }
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_cbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
-                    uint32_t polynomial_size, uint32_t level_count_cbs,
-                    uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_cbs(uint32_t glwe_dimension,
+                                                 uint32_t lwe_dimension,
+                                                 uint32_t polynomial_size,
+                                                 uint32_t level_count_cbs,
+                                                 uint32_t number_of_inputs) {
-  int buffer_size = number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
-                        (glwe_dimension * polynomial_size + 1) *
-                        sizeof(Torus) + // lwe_array_in_fp_ks_buffer
-                    number_of_inputs * level_count_cbs *
-                        (glwe_dimension * polynomial_size + 1) *
-                        sizeof(Torus) + // lwe_array_out_pbs_buffer
-                    number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
-                        sizeof(Torus) + // lwe_array_in_shifted_buffer
-                    level_count_cbs * (glwe_dimension + 1) * polynomial_size *
-                        sizeof(Torus); // lut_vector_cbs
+  uint64_t buffer_size =
+      number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
+          (glwe_dimension * polynomial_size + 1) *
+          sizeof(Torus) + // lwe_array_in_fp_ks_buffer
+      number_of_inputs * level_count_cbs *
+          (glwe_dimension * polynomial_size + 1) *
+          sizeof(Torus) + // lwe_array_out_pbs_buffer
+      number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
+          sizeof(Torus) + // lwe_array_in_shifted_buffer
+      level_count_cbs * (glwe_dimension + 1) * polynomial_size *
+          sizeof(Torus); // lut_vector_cbs
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -132,7 +134,7 @@ __host__ void scratch_circuit_bootstrap(
   int pbs_count = number_of_inputs * level_count_cbs;
   // allocate and initialize device pointers for circuit bootstrap
   if (allocate_gpu_memory) {
-    int buffer_size =
+    uint64_t buffer_size =
         get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
                                    polynomial_size, level_count_cbs,
                                    number_of_inputs) +

View File

@@ -210,9 +210,8 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
 }
 template <typename Torus>
-__host__ __device__ int
-get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
-                                      uint32_t polynomial_size) {
+__host__ __device__ uint64_t get_memory_needed_per_block_cmux_tree(
+    uint32_t glwe_dimension, uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // glwe_sub
          sizeof(double2) * polynomial_size / 2 *
              (glwe_dimension + 1) + // res_fft
@@ -220,27 +219,27 @@ get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
 }
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_cmux_tree(uint32_t glwe_dimension, uint32_t polynomial_size,
-                          uint32_t level_count, uint32_t r, uint32_t tau,
-                          uint32_t max_shared_memory) {
+__host__ __device__ uint64_t get_buffer_size_cmux_tree(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t r, uint32_t tau, uint32_t max_shared_memory) {
-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
-      glwe_dimension, polynomial_size);
-  int num_lut = (1 << r);
-  int ggsw_size = polynomial_size * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) * level_count;
-  int glwe_size = (glwe_dimension + 1) * polynomial_size;
-  int device_mem = 0;
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
+                                                   polynomial_size);
+  uint64_t num_lut = (1 << r);
+  uint64_t ggsw_size = polynomial_size * (glwe_dimension + 1) *
+                       (glwe_dimension + 1) * level_count;
+  uint64_t glwe_size = (glwe_dimension + 1) * polynomial_size;
+  uint64_t device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
   }
   if (max_shared_memory < polynomial_size * sizeof(double)) {
     device_mem += polynomial_size * sizeof(double);
   }
-  int buffer_size = r * ggsw_size * sizeof(double) +
-                    num_lut * tau * glwe_size * sizeof(Torus) +
-                    num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
+  uint64_t buffer_size = r * ggsw_size * sizeof(double) +
+                         num_lut * tau * glwe_size * sizeof(Torus) +
+                         num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -253,8 +252,9 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
-      glwe_dimension, polynomial_size);
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
+                                                   polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
     check_cuda_error(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -265,7 +265,7 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_cmux_tree<Torus>(
+    uint64_t buffer_size = get_buffer_size_cmux_tree<Torus>(
         glwe_dimension, polynomial_size, level_count, r, tau,
         max_shared_memory);
     *cmux_tree_buffer =
@@ -308,8 +308,9 @@ host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
     return;
   }
-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
-      glwe_dimension, polynomial_size);
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
+                                                   polynomial_size);
   dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -467,7 +468,7 @@ __global__ void device_blind_rotation_and_sample_extraction(
 }
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_memory_needed_per_block_blind_rotation_sample_extraction(
     uint32_t glwe_dimension, uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size *
@@ -479,14 +480,14 @@ get_memory_needed_per_block_blind_rotation_sample_extraction(
 }
 template <typename Torus>
-__host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
+__host__ __device__ uint64_t get_buffer_size_blind_rotation_sample_extraction(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory) {
-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
           glwe_dimension, polynomial_size);
-  int device_mem = 0;
+  uint64_t device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * tau;
   }
@@ -495,8 +496,8 @@ __host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
   }
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
-  int buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
-                    + device_mem;
+  uint64_t buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
+                         + device_mem;
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -509,7 +510,7 @@ __host__ void scratch_blind_rotation_sample_extraction(
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
           glwe_dimension, polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
@@ -524,9 +525,10 @@ __host__ void scratch_blind_rotation_sample_extraction(
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_blind_rotation_sample_extraction<Torus>(
-        glwe_dimension, polynomial_size, level_count, mbr_size, tau,
-        max_shared_memory);
+    uint64_t buffer_size =
+        get_buffer_size_blind_rotation_sample_extraction<Torus>(
+            glwe_dimension, polynomial_size, level_count, mbr_size, tau,
+            max_shared_memory);
     *br_se_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
     check_cuda_error(cudaGetLastError());
   }
@@ -542,7 +544,7 @@ __host__ void host_blind_rotate_and_sample_extraction(
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
           glwe_dimension, polynomial_size);

View File

@@ -27,14 +27,15 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
 }
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t polynomial_size,
-                       uint32_t level_count_cbs, uint32_t tau,
-                       uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_cbs_vp(uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count_cbs,
+                                                    uint32_t tau,
+                                                    uint32_t number_of_inputs) {
   int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * polynomial_size;
-  int buffer_size =
+  uint64_t buffer_size =
       number_of_inputs * level_count_cbs * sizeof(Torus) + // lut_vector_indexes
       number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
       tau * (glwe_dimension + 1) * polynomial_size *
@@ -58,7 +59,7 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
       (Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
   uint32_t r = number_of_inputs - params::log2_degree;
   uint32_t mbr_size = number_of_inputs - r;
-  int buffer_size =
+  uint64_t buffer_size =
       get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
                                     level_count_cbs, tau, number_of_inputs) +
       get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
@@ -83,7 +84,7 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
     h_lut_vector_indexes[index] = index % level_count_cbs;
   }
   // lut_vector_indexes is the last buffer in the cbs_vp_buffer
-  int lut_vector_indexes_size =
+  uint64_t lut_vector_indexes_size =
       number_of_inputs * level_count_cbs * sizeof(Torus);
   int8_t *d_lut_vector_indexes =
       (int8_t *)*cbs_vp_buffer +
@@ -192,13 +193,13 @@ __host__ void host_circuit_bootstrap_vertical_packing(
 }
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_wop_pbs(uint32_t lwe_dimension,
                         uint32_t number_of_bits_of_message_including_padding) {
-  int buffer_size = (lwe_dimension + 1) *
-                    (number_of_bits_of_message_including_padding) *
-                    sizeof(Torus); // lwe_array_out_bit_extract
+  uint64_t buffer_size = (lwe_dimension + 1) *
+                         (number_of_bits_of_message_including_padding) *
+                         sizeof(Torus); // lwe_array_out_bit_extract
   return buffer_size + buffer_size % sizeof(double2);
 }
@@ -216,7 +217,7 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
-  int bit_extract_buffer_size =
+  uint64_t bit_extract_buffer_size =
       get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
                                           polynomial_size, number_of_inputs) +
       get_buffer_size_bootstrap_low_latency<Torus>(
@@ -228,7 +229,7 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
   uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
   uint32_t mbr_size = cbs_vp_number_of_inputs - r;
   if (allocate_gpu_memory) {
-    int buffer_size =
+    uint64_t buffer_size =
         bit_extract_buffer_size +
         get_buffer_size_wop_pbs<Torus>(
             lwe_dimension, number_of_bits_of_message_including_padding) +