Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 07:38:08 -05:00
Compare commits: go/refacto...al/div_per (1 commit)
| Author | SHA1 | Date |
|---|---|---|
| | 7b0989cebc | |
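The single commit in this compare view adds explicit CUDA execution-space qualifiers to the programmable-bootstrap buffer-size helpers: `__host__ __device__` where a helper is needed both on the CPU (to size allocations) and inside kernels, and `__host__` alone where a function calls into the CUDA runtime and must stay host-side. As a minimal sketch of what the qualifier pair buys (the helper name and parameters below are illustrative, not taken from the diff):

```cuda
#include <cstdint>
#include <cstdio>

// Illustrative helper in the same style as the diff: callable from host
// code (to size a shared-memory allocation before a kernel launch) and
// from device code, so it carries both qualifiers.
template <typename Torus>
__host__ __device__ uint64_t example_buffer_size(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}

int main() {
  // Host-side use: decide how much dynamic shared memory a kernel needs.
  printf("%llu bytes\n",
         (unsigned long long)example_buffer_size<uint64_t>(2048));
  return 0;
}
```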
```diff
@@ -84,26 +84,28 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_step_one(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size +      // accumulator_rotated
          sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_step_two(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size +      // accumulator
          sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 
 template <typename Torus>
-uint64_t
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 
 template <typename Torus>
-uint64_t
+__host__ __device__ uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size +      // accumulator_rotated
          sizeof(Torus) * polynomial_size +      // accumulator
```
```diff
@@ -111,19 +113,21 @@ get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_programmable_bootstrap_tbc(
     uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
     uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // tbc
 }
 
 template <typename Torus>
-uint64_t
+__host__ __device__ uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size +      // accumulator_rotated
          sizeof(Torus) * polynomial_size +      // accumulator
```
```diff
@@ -131,13 +135,14 @@ get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
 }
 
 template <typename Torus>
-uint64_t
+__host__ __device__ uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }
 
 template <typename Torus>
-bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
+__host__ bool
+supports_distributed_shared_memory_on_classic_programmable_bootstrap(
     uint32_t polynomial_size);
 
 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
```
```diff
@@ -293,7 +298,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 };
 
 template <typename Torus>
-uint64_t get_buffer_size_programmable_bootstrap_cg(
+__host__ uint64_t get_buffer_size_programmable_bootstrap_cg(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count) {
   int max_shared_memory = cuda_get_max_shared_memory(0);
```
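To make the shared-memory formulas in these hunks concrete, here is a small standalone check that restates two of them for `Torus = uint64_t` at an illustrative `polynomial_size` of 2048 (the function names below are hypothetical stand-ins, not the library's): the full step-one buffer is 8 * 2048 + 16 * 1024 = 32768 bytes, and the partial buffer is the 16384-byte FFT accumulator alone.

```cuda
#include <cstdint>
#include <cstdio>

// Restatement of two formulas from the diff, specialized to uint64_t.
// sizeof(double2) is 16, so the FFT term is 16 * (n / 2) = 8n bytes.
__host__ __device__ uint64_t full_sm_step_one(uint32_t n) {
  return sizeof(uint64_t) * n +   // accumulator_rotated: 8n bytes
         sizeof(double2) * n / 2; // accumulator fft: 8n bytes
}

__host__ __device__ uint64_t partial_sm(uint32_t n) {
  return sizeof(double2) * n / 2; // accumulator fft only
}

int main() {
  // For n = 2048: full = 32768 bytes (32 KiB), partial = 16384 bytes.
  printf("full:    %llu\n", (unsigned long long)full_sm_step_one(2048));
  printf("partial: %llu\n", (unsigned long long)partial_sm(2048));
  return 0;
}
```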
```diff
@@ -35,7 +35,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 }
 
 template <typename Torus>
-bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
+__host__ bool
+supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 
 template <typename Torus>
```
```diff
@@ -94,31 +95,40 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     uint32_t lut_count, uint32_t lut_stride);
 
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 
 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
```
```diff
@@ -289,7 +299,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
 };
 
 template <typename Torus, class params>
-uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size);
+__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                                     uint32_t polynomial_size);
 
 #endif // CUDA_MULTI_BIT_H
```
```diff
@@ -213,7 +213,8 @@ __global__ void device_programmable_bootstrap_amortized(
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_amortized(
     uint32_t polynomial_size, uint32_t glwe_dimension) {
   return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
          sizeof(Torus) * polynomial_size *
```
```diff
@@ -224,13 +225,14 @@ uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_programmable_bootstrap_amortized(
     uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_programmable_bootstrap_amortized(
+__host__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_lwe_ciphertext_count) {
 
```
```diff
@@ -177,18 +177,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size; // accumulator
 }
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size * 2; // accumulator
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
     uint32_t grouping_factor, uint32_t lwe_chunk_size) {
```
```diff
@@ -261,7 +261,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_programmable_bootstrap(
+__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count) {
 
```
```diff
@@ -432,8 +432,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
  * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
  */
 template <typename Torus, class params>
-uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size) {
+__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                                     uint32_t polynomial_size) {
 
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
```
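Note that `get_lwe_chunk_size` gets `__host__` alone, not the `__host__ __device__` pair: per the comment preserved in the hunk it encodes a heuristic benchmarked on an RTX 4090, and selecting a chunk size generally means querying device properties through the CUDA runtime API, which cannot run on the device. A hedged sketch of that host-only pattern (the helper and its heuristic are hypothetical, not the library's logic):

```cuda
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Host-only helper: the CUDA runtime call below cannot execute on the
// device, so __host__ (and not __device__) is the right qualifier here.
__host__ uint32_t example_chunk_size(uint32_t gpu_index) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, gpu_index);
  // Illustrative heuristic only: scale the chunk with the SM count.
  return prop.multiProcessorCount >= 128 ? 64 : 32;
}

int main() {
  printf("chunk size: %u\n", example_chunk_size(0));
  return 0;
}
```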
```diff
@@ -362,26 +362,48 @@ __global__ void __launch_bounds__(params::degree / params::opt)
   }
 }
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
     uint32_t polynomial_size) {
   return sizeof(double2) * polynomial_size / 2; // accumulator
 }
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size * 2; // accumulator
 }
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size; // accumulator
 }
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size; // accumulator
 }
 
+template <typename Torus>
+__host__ __device__ uint64_t get_buffer_size_multibit_programmable_bootstrap(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
+
+  uint64_t buffer_size = 0;
+  buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
+                 (glwe_dimension + 1) * (glwe_dimension + 1) *
+                 (polynomial_size / 2) * sizeof(double2); // keybundle fft
+  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
+                 level_count * (polynomial_size / 2) *
+                 sizeof(double2); // global_accumulator_fft
+  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
+                 polynomial_size * sizeof(Torus); // global_accumulator
+
+  return buffer_size + buffer_size % sizeof(double2);
+}
+
 template <typename Torus, typename params>
 __host__ void scratch_multi_bit_programmable_bootstrap(
     cudaStream_t stream, uint32_t gpu_index,
```
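The `get_buffer_size_multibit_programmable_bootstrap` helper added in this hunk sums three global buffers (keybundle FFT, global accumulator FFT, global accumulator) and pads the total to a `double2` boundary. The sketch below restates the same arithmetic for `Torus = uint64_t`, with explicit 64-bit casts added as a guard against 32-bit overflow; the parameter values in `main` are illustrative, not taken from the diff:

```cuda
#include <cstdint>
#include <cstdio>

// Same three-term sum as the function added in the hunk above,
// specialized to uint64_t so the total can be printed on the host.
uint64_t multibit_buffer_size(uint32_t glwe_dimension, uint32_t polynomial_size,
                              uint32_t level_count, uint32_t num_inputs,
                              uint32_t lwe_chunk_size) {
  uint64_t size = 0;
  size += (uint64_t)num_inputs * lwe_chunk_size * level_count *
          (glwe_dimension + 1) * (glwe_dimension + 1) *
          (polynomial_size / 2) * sizeof(double2); // keybundle fft
  size += (uint64_t)num_inputs * (glwe_dimension + 1) * level_count *
          (polynomial_size / 2) * sizeof(double2); // global_accumulator_fft
  size += (uint64_t)num_inputs * (glwe_dimension + 1) * polynomial_size *
          sizeof(uint64_t); // global_accumulator
  return size + size % sizeof(double2); // pad to a double2 boundary
}

int main() {
  // Illustrative parameters: glwe_dimension = 1, polynomial_size = 2048,
  // level_count = 1, 4 input ciphertexts, chunk size 2 -> 786432 bytes.
  printf("%llu bytes\n",
         (unsigned long long)multibit_buffer_size(1, 2048, 1, 4, 2));
  return 0;
}
```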
```diff
@@ -341,7 +341,8 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
 }
 
 template <typename Torus>
-bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
+__host__ bool
+supports_distributed_shared_memory_on_classic_programmable_bootstrap(
     uint32_t polynomial_size) {
   uint64_t minimum_sm =
       get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
```
```diff
@@ -180,18 +180,21 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size; // distributed shared memory
 }
 
 template <typename Torus>
-uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size; // accumulator
 }
 template <typename Torus>
-uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size) {
   return sizeof(Torus) * polynomial_size * 2; // accumulator
 }
```
```diff
@@ -345,7 +348,8 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
 }
 
 template <typename Torus>
-bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
+__host__ bool
+supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
     uint32_t polynomial_size) {
   uint64_t minimum_sm =
       get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
```
```diff
@@ -438,7 +442,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
   return cluster_size >= level_count * (glwe_dimension + 1);
 }
 
-template bool
+template __host__ bool
 supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
     uint32_t polynomial_size);
 #endif // FASTMULTIBIT_PBS_H
```
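The final hunk also touches an explicit template instantiation: once the primary template is declared `__host__`, the instantiation line repeats the qualifier so the two declarations agree, which is the `template bool` to `template __host__ bool` change above. A minimal sketch of the pattern, with a hypothetical function standing in for the library's:

```cuda
#include <cstdint>

// The primary template carries an execution-space qualifier...
template <typename Torus>
__host__ bool example_supports_feature(uint32_t polynomial_size) {
  return polynomial_size >= 256; // illustrative predicate only
}

// ...so the explicit instantiation repeats it, mirroring the diff.
template __host__ bool example_supports_feature<uint64_t>(uint32_t);
```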