Compare commits

...

1 Commits

Author SHA1 Message Date
Agnes Leroy
7b0989cebc Revert "chore(gpu): remove some host decoration and duplicated def"
This reverts commit a26e68c3bc.
2024-10-02 11:31:29 +02:00
9 changed files with 87 additions and 41 deletions

View File

@@ -84,26 +84,28 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
@@ -111,19 +113,21 @@ get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
}
template <typename Torus>
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // tbc
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
@@ -131,13 +135,14 @@ get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
@@ -293,7 +298,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
};
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
__host__ uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
int max_shared_memory = cuda_get_max_shared_memory(0);

View File

@@ -35,7 +35,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
}
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
@@ -94,31 +95,40 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t lut_count, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
@@ -289,7 +299,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
};
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
#endif // CUDA_MULTI_BIT_H

View File

@@ -213,7 +213,8 @@ __global__ void device_programmable_bootstrap_amortized(
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_amortized(
uint32_t polynomial_size, uint32_t glwe_dimension) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
@@ -224,13 +225,14 @@ uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
}
template <typename Torus>
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_amortized(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_amortized(
__host__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {

View File

@@ -177,18 +177,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
template <typename Torus>
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t lwe_chunk_size) {

View File

@@ -261,7 +261,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap(
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {

View File

@@ -432,8 +432,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(

View File

@@ -362,26 +362,48 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_programmable_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // global_accumulator_fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename params>
__host__ void scratch_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,

View File

@@ -341,7 +341,8 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
}
template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(

View File

@@ -180,18 +180,21 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
template <typename Torus>
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // distributed shared memory
}
template <typename Torus>
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
@@ -345,7 +348,8 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
}
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
@@ -438,7 +442,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
return cluster_size >= level_count * (glwe_dimension + 1);
}
template bool
template __host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
uint32_t polynomial_size);
#endif // FASTMULTIBIT_PBS_H