Mirror of https://github.com/zama-ai/concrete.git
fix(concrete_cuda): fix buffer size type in the scratches, int is not big enough
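
Why int is not big enough here: the scratch-size helpers below multiply several 32-bit parameters together with sizeof(Torus), and on large parameter sets the byte count exceeds INT_MAX (about 2.1e9), so a 32-bit result wraps around and the following cuda_malloc_async under-allocates. A minimal standalone sketch of the failure mode (the names mirror get_buffer_size_bootstrap_low_latency; the parameter values are hypothetical, not taken from this commit):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical but realistic sizes: the names mirror the parameters of
  // get_buffer_size_bootstrap_low_latency, the values are made up.
  uint32_t glwe_dimension = 1, level_count = 2, polynomial_size = 8192;
  uint32_t input_lwe_ciphertext_count = 50000;

  // Rough per-block footprint, as in the full-SM helpers: a few
  // sizeof(Torus) * polynomial_size terms (Torus = uint64_t here).
  uint64_t full_dm = sizeof(uint64_t) * polynomial_size * 4;

  // Total device memory across inputs, levels and GLWE polynomials.
  uint64_t device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                        (glwe_dimension + 1);

  // ~5.2e10 bytes: far beyond INT32_MAX, so an int accumulator would wrap.
  printf("device_mem = %llu bytes, INT32_MAX = %d\n",
         (unsigned long long)device_mem, INT32_MAX);
}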
@@ -130,12 +130,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value,
 }
 
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_extract_bits(uint32_t glwe_dimension, uint32_t lwe_dimension,
-                             uint32_t polynomial_size,
-                             uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_extract_bits(
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t number_of_inputs) {
 
-  int buffer_size =
+  uint64_t buffer_size =
       sizeof(Torus) * number_of_inputs // lut_vector_indexes
       + ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
       + (glwe_dimension * polynomial_size + 1) *
@@ -159,7 +158,7 @@ scratch_extract_bits(void *v_stream, uint32_t gpu_index,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
-  int buffer_size =
+  uint64_t buffer_size =
       get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
                                           polynomial_size, number_of_inputs) +
       get_buffer_size_bootstrap_low_latency<Torus>(
@@ -212,9 +212,8 @@ __global__ void device_bootstrap_amortized(
 }
 
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
-                                            uint32_t glwe_dimension) {
+__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
+    uint32_t polynomial_size, uint32_t glwe_dimension) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
          sizeof(Torus) * polynomial_size *
              (glwe_dimension + 1) + // accumulator rotated
@@ -224,23 +223,23 @@ get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
 }
 
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 
 template <typename Torus>
-__host__ __device__ int get_buffer_size_bootstrap_amortized(
+__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
 
-  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
+  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
       polynomial_size, glwe_dimension);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
-  int partial_dm = full_sm - partial_sm;
-  int full_dm = full_sm;
-  int device_mem = 0;
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t full_dm = full_sm;
+  uint64_t device_mem = 0;
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count;
   } else if (max_shared_memory < full_sm) {
@@ -260,9 +259,9 @@ __host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
-  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
+  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
       polynomial_size, glwe_dimension);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
@@ -279,7 +278,7 @@ __host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
                          cudaFuncCachePreferShared));
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
+    uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
         max_shared_memory);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
@@ -297,15 +296,15 @@ __host__ void host_bootstrap_amortized(
     uint32_t lwe_idx, uint32_t max_shared_memory) {

   cudaSetDevice(gpu_index);
-  int SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
+  uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
       polynomial_size, glwe_dimension);

-  int SM_PART =
+  uint64_t SM_PART =
       get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);

-  int DM_PART = SM_FULL - SM_PART;
+  uint64_t DM_PART = SM_FULL - SM_PART;

-  int DM_FULL = SM_FULL;
+  uint64_t DM_FULL = SM_FULL;

   auto stream = static_cast<cudaStream_t *>(v_stream);

@@ -135,7 +135,7 @@ __global__ void device_bootstrap_low_latency(
     Torus *lwe_array_in, double2 *bootstrapping_key, double2 *join_buffer,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, int8_t *device_mem,
-    int device_memory_size_per_block) {
+    uint64_t device_memory_size_per_block) {

   grid_group grid = this_grid();

@@ -144,14 +144,15 @@ __global__ void device_bootstrap_low_latency(
   // much faster than global memory
   extern __shared__ int8_t sharedmem[];
   int8_t *selected_memory;
-  int block_index =
-      blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
   uint32_t glwe_dimension = gridDim.y - 1;

-  if constexpr (SMD == FULLSM)
+  if constexpr (SMD == FULLSM) {
     selected_memory = sharedmem;
-  else
+  } else {
+    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
+                      blockIdx.z * gridDim.x * gridDim.y;
     selected_memory = &device_mem[block_index * device_memory_size_per_block];
+  }

   // We always compute the pointer with most restrictive alignment to avoid
   // alignment issues
@@ -245,7 +246,7 @@ __global__ void device_bootstrap_low_latency(
 }
 
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size + // accumulator_rotated
          sizeof(Torus) * polynomial_size + // accumulator
@@ -253,23 +254,23 @@ get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
 }
 
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }
 
 template <typename Torus>
-__host__ __device__ int get_buffer_size_bootstrap_low_latency(
+__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
 
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_dm = full_sm - partial_sm;
-  int full_dm = full_sm;
-  int device_mem = 0;
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t full_dm = full_sm;
+  uint64_t device_mem = 0;
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
@@ -277,9 +278,9 @@ __host__ __device__ int get_buffer_size_bootstrap_low_latency(
     device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
   }
-  int buffer_size = device_mem + (glwe_dimension + 1) * level_count *
-                                     input_lwe_ciphertext_count *
-                                     polynomial_size / 2 * sizeof(double2);
+  uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
+                                          input_lwe_ciphertext_count *
+                                          polynomial_size / 2 * sizeof(double2);
   return buffer_size + buffer_size % sizeof(double2);
 }

@@ -292,9 +293,9 @@ __host__ void scratch_bootstrap_low_latency(
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     check_cuda_error(cudaFuncSetAttribute(
@@ -313,7 +314,7 @@ __host__ void scratch_bootstrap_low_latency(
     check_cuda_error(cudaGetLastError());
   }
   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
+    uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
         glwe_dimension, polynomial_size, level_count,
         input_lwe_ciphertext_count, max_shared_memory);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
@@ -339,15 +340,15 @@ __host__ void host_bootstrap_low_latency(

   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
-  int full_sm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);

-  int partial_sm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);

-  int full_dm = full_sm;
+  uint64_t full_dm = full_sm;

-  int partial_dm = full_dm - partial_sm;
+  uint64_t partial_dm = full_dm - partial_sm;

   int8_t *d_mem = pbs_buffer;
   double2 *buffer_fft =
@@ -101,21 +101,23 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
 }
 
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_cbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
-                    uint32_t polynomial_size, uint32_t level_count_cbs,
-                    uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_cbs(uint32_t glwe_dimension,
+                                                 uint32_t lwe_dimension,
+                                                 uint32_t polynomial_size,
+                                                 uint32_t level_count_cbs,
+                                                 uint32_t number_of_inputs) {

-  int buffer_size = number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
-                        (glwe_dimension * polynomial_size + 1) *
-                        sizeof(Torus) + // lwe_array_in_fp_ks_buffer
-                    number_of_inputs * level_count_cbs *
-                        (glwe_dimension * polynomial_size + 1) *
-                        sizeof(Torus) + // lwe_array_out_pbs_buffer
-                    number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
-                        sizeof(Torus) + // lwe_array_in_shifted_buffer
-                    level_count_cbs * (glwe_dimension + 1) * polynomial_size *
-                        sizeof(Torus); // lut_vector_cbs
+  uint64_t buffer_size =
+      number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
+          (glwe_dimension * polynomial_size + 1) *
+          sizeof(Torus) + // lwe_array_in_fp_ks_buffer
+      number_of_inputs * level_count_cbs *
+          (glwe_dimension * polynomial_size + 1) *
+          sizeof(Torus) + // lwe_array_out_pbs_buffer
+      number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
+          sizeof(Torus) + // lwe_array_in_shifted_buffer
+      level_count_cbs * (glwe_dimension + 1) * polynomial_size *
+          sizeof(Torus); // lut_vector_cbs
   return buffer_size + buffer_size % sizeof(double2);
 }

@@ -132,7 +134,7 @@ __host__ void scratch_circuit_bootstrap(
   int pbs_count = number_of_inputs * level_count_cbs;
   // allocate and initialize device pointers for circuit bootstrap
   if (allocate_gpu_memory) {
-    int buffer_size =
+    uint64_t buffer_size =
         get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
                                    polynomial_size, level_count_cbs,
                                    number_of_inputs) +
@@ -210,9 +210,8 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
 }
 
 template <typename Torus>
-__host__ __device__ int
-get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
-                                      uint32_t polynomial_size) {
+__host__ __device__ uint64_t get_memory_needed_per_block_cmux_tree(
+    uint32_t glwe_dimension, uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // glwe_sub
          sizeof(double2) * polynomial_size / 2 *
              (glwe_dimension + 1) + // res_fft
@@ -220,27 +219,27 @@ get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
 }
 
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_cmux_tree(uint32_t glwe_dimension, uint32_t polynomial_size,
-                          uint32_t level_count, uint32_t r, uint32_t tau,
-                          uint32_t max_shared_memory) {
+__host__ __device__ uint64_t get_buffer_size_cmux_tree(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t r, uint32_t tau, uint32_t max_shared_memory) {

-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
-      glwe_dimension, polynomial_size);
-  int num_lut = (1 << r);
-  int ggsw_size = polynomial_size * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) * level_count;
-  int glwe_size = (glwe_dimension + 1) * polynomial_size;
-  int device_mem = 0;
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
+                                                   polynomial_size);
+  uint64_t num_lut = (1 << r);
+  uint64_t ggsw_size = polynomial_size * (glwe_dimension + 1) *
+                       (glwe_dimension + 1) * level_count;
+  uint64_t glwe_size = (glwe_dimension + 1) * polynomial_size;
+  uint64_t device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
   }
   if (max_shared_memory < polynomial_size * sizeof(double)) {
     device_mem += polynomial_size * sizeof(double);
   }
-  int buffer_size = r * ggsw_size * sizeof(double) +
-                    num_lut * tau * glwe_size * sizeof(Torus) +
-                    num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
+  uint64_t buffer_size = r * ggsw_size * sizeof(double) +
+                         num_lut * tau * glwe_size * sizeof(Torus) +
+                         num_lut * tau * glwe_size * sizeof(Torus) + device_mem;
   return buffer_size + buffer_size % sizeof(double2);
 }

@@ -253,8 +252,9 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
-      glwe_dimension, polynomial_size);
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
+                                                   polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
     check_cuda_error(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -265,7 +265,7 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   }

   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_cmux_tree<Torus>(
+    uint64_t buffer_size = get_buffer_size_cmux_tree<Torus>(
         glwe_dimension, polynomial_size, level_count, r, tau,
         max_shared_memory);
     *cmux_tree_buffer =
@@ -308,8 +308,9 @@ host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
     return;
   }

-  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
-      glwe_dimension, polynomial_size);
+  uint64_t memory_needed_per_block =
+      get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
+                                                   polynomial_size);

   dim3 thds(polynomial_size / params::opt, 1, 1);

@@ -467,7 +468,7 @@ __global__ void device_blind_rotation_and_sample_extraction(
 }
 
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_memory_needed_per_block_blind_rotation_sample_extraction(
     uint32_t glwe_dimension, uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size *
@@ -479,14 +480,14 @@ get_memory_needed_per_block_blind_rotation_sample_extraction(
 }
 
 template <typename Torus>
-__host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
+__host__ __device__ uint64_t get_buffer_size_blind_rotation_sample_extraction(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory) {

-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
           glwe_dimension, polynomial_size);
-  int device_mem = 0;
+  uint64_t device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * tau;
   }
@@ -495,8 +496,8 @@ __host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
   }
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
-  int buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
-                    + device_mem;
+  uint64_t buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
+                         + device_mem;
   return buffer_size + buffer_size % sizeof(double2);
 }

@@ -509,7 +510,7 @@ __host__ void scratch_blind_rotation_sample_extraction(
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
           glwe_dimension, polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
@@ -524,9 +525,10 @@ __host__ void scratch_blind_rotation_sample_extraction(
   }

   if (allocate_gpu_memory) {
-    int buffer_size = get_buffer_size_blind_rotation_sample_extraction<Torus>(
-        glwe_dimension, polynomial_size, level_count, mbr_size, tau,
-        max_shared_memory);
+    uint64_t buffer_size =
+        get_buffer_size_blind_rotation_sample_extraction<Torus>(
+            glwe_dimension, polynomial_size, level_count, mbr_size, tau,
+            max_shared_memory);
     *br_se_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
     check_cuda_error(cudaGetLastError());
   }
@@ -542,7 +544,7 @@ __host__ void host_blind_rotate_and_sample_extraction(
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

-  int memory_needed_per_block =
+  uint64_t memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
           glwe_dimension, polynomial_size);

@@ -27,14 +27,15 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
 }
 
 template <typename Torus>
-__host__ __device__ int
-get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t polynomial_size,
-                       uint32_t level_count_cbs, uint32_t tau,
-                       uint32_t number_of_inputs) {
+__host__ __device__ uint64_t get_buffer_size_cbs_vp(uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count_cbs,
+                                                    uint32_t tau,
+                                                    uint32_t number_of_inputs) {

   int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * polynomial_size;
-  int buffer_size =
+  uint64_t buffer_size =
       number_of_inputs * level_count_cbs * sizeof(Torus) + // lut_vector_indexes
       number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
       tau * (glwe_dimension + 1) * polynomial_size *
@@ -58,7 +59,7 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
       (Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
   uint32_t r = number_of_inputs - params::log2_degree;
   uint32_t mbr_size = number_of_inputs - r;
-  int buffer_size =
+  uint64_t buffer_size =
       get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
                                     level_count_cbs, tau, number_of_inputs) +
       get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
@@ -83,7 +84,7 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
     h_lut_vector_indexes[index] = index % level_count_cbs;
   }
   // lut_vector_indexes is the last buffer in the cbs_vp_buffer
-  int lut_vector_indexes_size =
+  uint64_t lut_vector_indexes_size =
       number_of_inputs * level_count_cbs * sizeof(Torus);
   int8_t *d_lut_vector_indexes =
       (int8_t *)*cbs_vp_buffer +
@@ -192,13 +193,13 @@ __host__ void host_circuit_bootstrap_vertical_packing(
 }
 
 template <typename Torus>
-__host__ __device__ int
+__host__ __device__ uint64_t
 get_buffer_size_wop_pbs(uint32_t lwe_dimension,
                         uint32_t number_of_bits_of_message_including_padding) {

-  int buffer_size = (lwe_dimension + 1) *
-                    (number_of_bits_of_message_including_padding) *
-                    sizeof(Torus); // lwe_array_out_bit_extract
+  uint64_t buffer_size = (lwe_dimension + 1) *
+                         (number_of_bits_of_message_including_padding) *
+                         sizeof(Torus); // lwe_array_out_bit_extract
   return buffer_size + buffer_size % sizeof(double2);
 }

@@ -216,7 +217,7 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

-  int bit_extract_buffer_size =
+  uint64_t bit_extract_buffer_size =
       get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
                                           polynomial_size, number_of_inputs) +
       get_buffer_size_bootstrap_low_latency<Torus>(
@@ -228,7 +229,7 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
   uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
   uint32_t mbr_size = cbs_vp_number_of_inputs - r;
   if (allocate_gpu_memory) {
-    int buffer_size =
+    uint64_t buffer_size =
         bit_extract_buffer_size +
         get_buffer_size_wop_pbs<Torus>(
             lwe_dimension, number_of_bits_of_message_including_padding) +
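
A related C++ subtlety, worth noting next to this diff: widening only the declared type is not always enough, since a product of uint32_t operands is still evaluated in 32 bits before it is converted to uint64_t. That is presumably why the commit also widens intermediate locals such as num_lut, ggsw_size and glwe_size in get_buffer_size_cmux_tree, and relies on sizeof(...) being a 64-bit size_t in the other helpers. A small self-contained demonstration:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t a = 100000, b = 100000;

  // The multiply happens in 32 bits, wraps, and only then is widened.
  uint64_t wrapped = a * b;

  // Widening one operand first forces a 64-bit multiply.
  uint64_t correct = (uint64_t)a * b;

  printf("wrapped = %llu, correct = %llu\n",
         (unsigned long long)wrapped, (unsigned long long)correct);
  // Prints: wrapped = 1410065408, correct = 10000000000
}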