From e9243bce6f03762d4947aab516bad86af90df997 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 9 Feb 2023 10:28:19 +0100 Subject: [PATCH] refactor(cuda): change lut_vector_indexes type to Torus --- src/bit_extraction.cu | 10 +++++----- src/bit_extraction.cuh | 2 +- src/bootstrap_amortized.cu | 10 +++++----- src/bootstrap_amortized.cuh | 8 ++++---- src/bootstrap_low_latency.cu | 10 +++++----- src/bootstrap_low_latency.cuh | 2 +- src/circuit_bootstrap.cu | 10 +++++----- src/circuit_bootstrap.cuh | 2 +- src/wop_bootstrap.cuh | 18 +++++++++--------- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu index 6b0fe8b53..7a0345837 100644 --- a/src/bit_extraction.cu +++ b/src/bit_extraction.cu @@ -180,7 +180,7 @@ void cuda_extract_bits_64( (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lwe_array_out_ks_buffer, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + (uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, max_shared_memory); @@ -192,7 +192,7 @@ void cuda_extract_bits_64( (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lwe_array_out_ks_buffer, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + (uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, max_shared_memory); @@ -204,7 +204,7 @@ void cuda_extract_bits_64( (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lwe_array_out_ks_buffer, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + (uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, max_shared_memory); @@ -216,7 +216,7 @@ void cuda_extract_bits_64( (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lwe_array_out_ks_buffer, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + (uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, max_shared_memory); @@ -228,7 +228,7 @@ void cuda_extract_bits_64( (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lwe_array_out_ks_buffer, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + (uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples, max_shared_memory); diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh index 4bb0aa9a8..3ea5b3216 100644 --- a/src/bit_extraction.cuh +++ b/src/bit_extraction.cuh @@ -138,7 +138,7 @@ __host__ void host_extract_bits( Torus *lwe_array_in, Torus *lwe_array_in_buffer, Torus *lwe_array_in_shifted_buffer, Torus *lwe_array_out_ks_buffer, Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, - uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk, + Torus *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu index ead137428..61e94ca94 100644 --- a/src/bootstrap_amortized.cu +++ b/src/bootstrap_amortized.cu @@ -149,7 +149,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( case 512: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); @@ -157,7 +157,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( case 1024: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); @@ -165,7 +165,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( case 2048: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); @@ -173,7 +173,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( case 4096: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); @@ -181,7 +181,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( case 8192: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 4e13dc1c4..6162acb6b 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -51,7 +51,7 @@ template * is not FULLSM */ __global__ void device_bootstrap_amortized( - Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes, + Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, char *device_mem, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, @@ -216,9 +216,9 @@ __global__ void device_bootstrap_amortized( template __host__ void host_bootstrap_amortized( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector, - uint32_t *lut_vector_indexes, Torus *lwe_array_in, - double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu index 8651b2456..662f29779 100644 --- a/src/bootstrap_low_latency.cu +++ b/src/bootstrap_low_latency.cu @@ -183,7 +183,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( case 512: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, max_shared_memory); @@ -191,7 +191,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( case 1024: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, max_shared_memory); @@ -199,7 +199,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( case 2048: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, max_shared_memory); @@ -207,7 +207,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( case 4096: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, max_shared_memory); @@ -215,7 +215,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( case 8192: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, num_lut_vectors, max_shared_memory); diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index ac24ba474..5b0043944 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -248,7 +248,7 @@ __global__ void device_bootstrap_low_latency( template __host__ void host_bootstrap_low_latency( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector, - uint32_t *lut_vector_indexes, Torus *lwe_array_in, + Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu index 417084a9c..4c7ac4953 100644 --- a/src/circuit_bootstrap.cu +++ b/src/circuit_bootstrap.cu @@ -158,7 +158,7 @@ void cuda_circuit_bootstrap_64( v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, base_log_pksk, level_cbs, base_log_cbs, number_of_samples, @@ -169,7 +169,7 @@ void cuda_circuit_bootstrap_64( v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, base_log_pksk, level_cbs, base_log_cbs, number_of_samples, @@ -180,7 +180,7 @@ void cuda_circuit_bootstrap_64( v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, base_log_pksk, level_cbs, base_log_cbs, number_of_samples, @@ -191,7 +191,7 @@ void cuda_circuit_bootstrap_64( v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, base_log_pksk, level_cbs, base_log_cbs, number_of_samples, @@ -202,7 +202,7 @@ void cuda_circuit_bootstrap_64( v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, base_log_pksk, level_cbs, base_log_cbs, number_of_samples, diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh index 9bb713000..3bcd38b84 100644 --- a/src/circuit_bootstrap.cuh +++ b/src/circuit_bootstrap.cuh @@ -106,7 +106,7 @@ __host__ void host_circuit_bootstrap( void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in, double2 *fourier_bsk, Torus *fp_ksk_array, Torus *lwe_array_in_shifted_buffer, Torus *lut_vector, - uint32_t *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer, + Torus *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer, Torus *lwe_array_in_fp_ks_buffer, uint32_t delta_log, uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index 1760cad22..3bf7834c6 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -67,16 +67,16 @@ __host__ void host_circuit_bootstrap_vertical_packing( level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index); // indexes of lut vectors for cbs - uint32_t *h_lut_vector_indexes = - (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t)); + Torus *h_lut_vector_indexes = + (Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus)); for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) { h_lut_vector_indexes[index] = index % level_count_cbs; } - uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async( - number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); + Torus *lut_vector_indexes = (Torus *)cuda_malloc_async( + number_of_inputs * level_count_cbs * sizeof(Torus), stream, gpu_index); cuda_memcpy_async_to_gpu( lut_vector_indexes, h_lut_vector_indexes, - number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); + number_of_inputs * level_count_cbs * sizeof(Torus), stream, gpu_index); check_cuda_error(cudaGetLastError()); uint32_t bits = sizeof(Torus) * 8; @@ -145,12 +145,12 @@ __host__ void host_wop_pbs( // let mut h_lut_vector_indexes = vec![0 as u32; 1]; // indexes of lut vectors for bit extract - uint32_t *h_lut_vector_indexes = (uint32_t *)malloc(sizeof(uint32_t)); + Torus *h_lut_vector_indexes = (Torus *)malloc(sizeof(Torus)); h_lut_vector_indexes[0] = 0; - uint32_t *lut_vector_indexes = - (uint32_t *)cuda_malloc_async(sizeof(uint32_t), stream, gpu_index); + Torus *lut_vector_indexes = + (Torus *)cuda_malloc_async(sizeof(Torus), stream, gpu_index); cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes, - sizeof(uint32_t), stream, gpu_index); + sizeof(Torus), stream, gpu_index); check_cuda_error(cudaGetLastError()); Torus *lut_pbs = (Torus *)cuda_malloc_async( (2 * polynomial_size) * sizeof(Torus), stream, gpu_index);