diff --git a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml index 633e4eb82..fdcb89082 100644 --- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml +++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml @@ -46,8 +46,8 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws - profile: multi-gpu-test + backend: hyperstack + profile: multi-h100-nvlink cuda-integer-multi-bit-multi-gpu-benchmarks: name: Execute multi GPU integer multi-bit benchmarks @@ -62,11 +62,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs repo with tags uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 with: @@ -135,7 +147,7 @@ jobs: run: | python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \ --database tfhe_rs \ - --hardware "p3.8xlarge" \ + --hardware "n3-H100x8-nvlink" \ --backend gpu \ --project-version "${{ env.COMMIT_HASH }}" \ --branch ${{ github.ref_name }} \ diff --git a/.github/workflows/integer_multi_gpu_full_benchmark.yml b/.github/workflows/integer_multi_gpu_full_benchmark.yml index 40d56786e..13c5a0ae7 100644 --- a/.github/workflows/integer_multi_gpu_full_benchmark.yml +++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml @@ -35,8 +35,8 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws - profile: multi-gpu-test + backend: hyperstack + profile: multi-h100-nvlink cuda-integer-full-multi-gpu-benchmarks: name: Execute multi GPU integer benchmarks for all operations flavor @@ -54,11 +54,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. 
+ - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs repo with tags uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 with: @@ -117,7 +129,7 @@ jobs: run: | python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \ --database tfhe_rs \ - --hardware "p3.8xlarge" \ + --hardware "n3-H100x8-nvlink" \ --backend gpu \ --project-version "${{ env.COMMIT_HASH }}" \ --branch ${{ github.ref_name }} \ diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h index e928263b9..cbdd17fbf 100644 --- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h +++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h @@ -1,6 +1,8 @@ #ifndef HELPER_MULTI_GPU_H #define HELPER_MULTI_GPU_H #include +#include +#include extern std::mutex m; extern bool p2p_enabled; @@ -9,6 +11,20 @@ extern "C" { int cuda_setup_multi_gpu(); } +// Define a variant type that can be either a vector or a single pointer +template +using LweArrayVariant = std::variant, Torus *>; + +// Macro to define the visitor logic using std::holds_alternative for vectors +#define GET_VARIANT_ELEMENT(variant, index) \ + [&] { \ + if (std::holds_alternative>(variant)) { \ + return std::get>(variant)[index]; \ + } else { \ + return std::get(variant); \ + } \ + }() + int get_active_gpu_count(int num_inputs, int gpu_count); int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count); diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index a55c3d03e..df4eb39df 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -487,11 +487,21 @@ template struct int_radix_lut { // for the moment Torus *lwe_indexes_in; Torus *lwe_indexes_out; + Torus *h_lwe_indexes_in; + Torus *h_lwe_indexes_out; + // Enable optimizations if lwe_indexes_(in/out) are trivial + bool using_trivial_lwe_indexes = true; // lwe_trivial_indexes is the intermediary index we need in case // lwe_indexes_in != lwe_indexes_out Torus *lwe_trivial_indexes; Torus *tmp_lwe_before_ks; - Torus *tmp_lwe_after_ks; + + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector lwe_array_in_vec; + std::vector lwe_after_ks_vec; + std::vector lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec; int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, @@ -511,7 +521,7 @@ template struct int_radix_lut { cudaSetDevice(i); int8_t *gpu_pbs_buffer; auto num_blocks_on_gpu = - get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count); + get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count); execute_scratch_pbs( streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, @@ -551,20 +561,43 @@ template struct int_radix_lut { num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); lwe_trivial_indexes = (Torus *)cuda_malloc_async( num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + h_lwe_indexes_in 
= (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes[i] = i; + h_lwe_indexes_in[i] = i; - cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + memcpy(h_lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus)); + + /// With multiple GPUs we allocate arrays to be pushed to the vectors and + /// copy data on each GPU then when we gather data to GPU 0 we can copy + /// back to the original indexing + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_in_vec, num_radix_blocks, + params.big_lwe_dimension + 1); + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, num_radix_blocks, + params.small_lwe_dimension + 1); + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_after_pbs_vec, num_radix_blocks, + params.big_lwe_dimension + 1); + multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, num_radix_blocks); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, lwe_trivial_indexes, + num_radix_blocks); // Keyswitch Torus big_size = @@ -573,10 +606,6 @@ template struct int_radix_lut { (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); tmp_lwe_before_ks = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); - tmp_lwe_after_ks = - (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]); - cuda_synchronize_stream(streams[0], gpu_indexes[0]); - free(h_lwe_indexes); } } @@ -598,7 +627,14 @@ template struct int_radix_lut { buffer = base_lut_object->buffer; // Keyswitch tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks; - tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks; + + /// With multiple GPUs we allocate arrays to be pushed to the vectors and + /// copy data on each GPU then when we gather data to GPU 0 we can copy back + /// to the original indexing + lwe_array_in_vec = base_lut_object->lwe_array_in_vec; + lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec; + lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec; + lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec; mem_reuse = true; @@ -630,22 +666,24 @@ template struct int_radix_lut { num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); lwe_trivial_indexes = (Torus *)cuda_malloc_async( num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes[i] = i; + h_lwe_indexes_in[i] = i; - cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), 
streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_synchronize_stream(streams[0], gpu_indexes[0]); - free(h_lwe_indexes); + memcpy(h_lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus)); } // Return a pointer to idx-ith lut at gpu_index's global memory @@ -663,6 +701,22 @@ template struct int_radix_lut { return &lut_indexes[ind]; } + // If this function is called we assume the lwe_indexes_(in/out) are not the + // trivial anymore and thus we disable optimizations + void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index, + Torus *h_indexes_in, Torus *h_indexes_out) { + + memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus)); + memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus)); + + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, + num_blocks * sizeof(Torus), stream, gpu_index); + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out, + num_blocks * sizeof(Torus), stream, gpu_index); + + using_trivial_lwe_indexes = false; + } + // Broadcast luts from gpu src_gpu_idx to all active gpus void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t src_gpu_idx) { @@ -672,7 +726,6 @@ template struct int_radix_lut { auto src_lut_indexes = lut_indexes_vec[src_gpu_idx]; cuda_synchronize_stream(streams[0], gpu_indexes[0]); -#pragma omp parallel for num_threads(active_gpu_count) for (uint i = 0; i < active_gpu_count; i++) { if (i != src_gpu_idx) { auto dst_lut = lut_vec[i]; @@ -690,7 +743,6 @@ template struct int_radix_lut { void release(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { -#pragma omp parallel for num_threads(active_gpu_count) for (uint i = 0; i < active_gpu_count; i++) { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]); @@ -701,9 +753,13 @@ template struct int_radix_lut { cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]); + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + free(h_lwe_indexes_in); + free(h_lwe_indexes_out); + if (!mem_reuse) { cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]); - cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]); cuda_synchronize_stream(streams[0], gpu_indexes[0]); for (int i = 0; i < buffer.size(); i++) { switch (params.pbs_type) { @@ -721,6 +777,17 @@ template struct int_radix_lut { cuda_synchronize_stream(streams[i], gpu_indexes[i]); } buffer.clear(); + + multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec); + multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec); + multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec); + multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec); + for (uint i = 0; i < active_gpu_count; i++) + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + lwe_array_in_vec.clear(); + lwe_after_ks_vec.clear(); + lwe_after_pbs_vec.clear(); + lwe_trivial_indexes_vec.clear(); } } }; @@ -782,10 +849,6 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < bits_per_block; i++) 
h_lwe_indexes_in[i + j * bits_per_block] = j; } - cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in, - num_radix_blocks * bits_per_block * - sizeof(Torus), - streams[0], gpu_indexes[0]); /** * the output should aim different lwe ciphertexts, so lwe_indexes_out = @@ -797,10 +860,9 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < num_radix_blocks * bits_per_block; i++) h_lwe_indexes_out[i] = i; - cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out, - num_radix_blocks * bits_per_block * - sizeof(Torus), - streams[0], gpu_indexes[0]); + lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in, + h_lwe_indexes_out); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); free(h_lut_indexes); free(h_lwe_indexes_in); @@ -1666,6 +1728,7 @@ template struct int_arithmetic_scalar_shift_buffer { cudaStream_t *local_streams_1; cudaStream_t *local_streams_2; + uint32_t active_gpu_count; int_arithmetic_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -1673,12 +1736,15 @@ template struct int_arithmetic_scalar_shift_buffer { int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { + active_gpu_count = get_active_gpu_count(1, gpu_count); // In the arithmetic shift, a PBS has to be applied to the last rotated // block twice: once to shift it, once to compute the padding block to be // copied onto all blocks to the left of the last rotated block - local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + local_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + local_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { local_streams_1[j] = cuda_create_stream(gpu_indexes[j]); local_streams_2[j] = cuda_create_stream(gpu_indexes[j]); } @@ -1811,7 +1877,7 @@ template struct int_arithmetic_scalar_shift_buffer { void release(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]); cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]); } @@ -1840,20 +1906,24 @@ template struct int_zero_out_if_buffer { cudaStream_t *true_streams; cudaStream_t *false_streams; + uint32_t active_gpu_count; int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); Torus big_size = (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); if (allocate_gpu_memory) { tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); // We may use a different stream to allow concurrent operation - true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + true_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + false_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { true_streams[j] = cuda_create_stream(gpu_indexes[j]); false_streams[j] = cuda_create_stream(gpu_indexes[j]); } @@ -1862,7 
+1932,7 @@ template struct int_zero_out_if_buffer { void release(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(tmp, streams[0], gpu_indexes[0]); - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(true_streams[j], gpu_indexes[j]); cuda_destroy_stream(false_streams[j], gpu_indexes[j]); } @@ -2295,6 +2365,7 @@ template struct int_comparison_buffer { int_radix_lut *signed_msb_lut; cudaStream_t *lsb_streams; cudaStream_t *msb_streams; + uint32_t active_gpu_count; int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, COMPARISON_TYPE op, @@ -2304,14 +2375,18 @@ template struct int_comparison_buffer { this->op = op; this->is_signed = is_signed; + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + identity_lut_f = [](Torus x) -> Torus { return x; }; auto big_lwe_size = params.big_lwe_dimension + 1; if (allocate_gpu_memory) { - lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + lsb_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + msb_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { lsb_streams[j] = cuda_create_stream(gpu_indexes[j]); msb_streams[j] = cuda_create_stream(gpu_indexes[j]); } @@ -2475,7 +2550,7 @@ template struct int_comparison_buffer { signed_msb_lut->release(streams, gpu_indexes, gpu_count); delete (signed_msb_lut); } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]); cuda_destroy_stream(msb_streams[j], gpu_indexes[j]); } @@ -2486,6 +2561,7 @@ template struct int_comparison_buffer { template struct int_div_rem_memory { int_radix_params params; + uint32_t active_gpu_count; // memory objects for other operations int_logical_scalar_shift_buffer *shift_mem_1; @@ -2721,6 +2797,8 @@ template struct int_div_rem_memory { int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_blocks, bool allocate_gpu_memory) { + active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); + this->params = params; shift_mem_1 = new int_logical_scalar_shift_buffer( streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, @@ -2740,11 +2818,15 @@ template struct int_div_rem_memory { init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks); init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); - sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_3 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_4 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_3[j] 
= cuda_create_stream(gpu_indexes[j]); @@ -2815,7 +2897,7 @@ template struct int_div_rem_memory { delete[] merge_overflow_flags_luts; // release sub streams - for (uint i = 0; i < gpu_count; i++) { + for (uint i = 0; i < active_gpu_count; i++) { cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]); @@ -2992,12 +3074,12 @@ template struct int_resolve_signed_overflow_memory { template struct int_signed_overflowing_add_or_sub_memory { int_radix_params params; + uint32_t active_gpu_count; // memory objects for other operations int_sc_prop_memory *scp_mem; int_last_block_inner_propagate_memory *las_block_prop_mem; int_resolve_signed_overflow_memory *resolve_overflow_mem; - // lookupt tables // sub streams cudaStream_t *sub_streams_1; @@ -3036,13 +3118,17 @@ template struct int_signed_overflowing_add_or_sub_memory { int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op, bool allocate_gpu_memory) { this->params = params; + active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); - allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); + allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count, + num_blocks); // initialize streams - sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); } @@ -3078,7 +3164,7 @@ template struct int_signed_overflowing_add_or_sub_memory { cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); // sub streams - for (uint i = 0; i < gpu_count; i++) { + for (uint i = 0; i < active_gpu_count; i++) { cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); } diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 70297f50f..924ec131f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -9,15 +9,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset = 0); + uint32_t base_log, uint32_t level_count, uint32_t num_samples); void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset = 0); + uint32_t base_log, uint32_t level_count, uint32_t num_samples); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index 260c037b6..0980ddee4 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ 
b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -51,7 +51,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t max_shared_memory); void cleanup_cuda_programmable_bootstrap_amortized(void *stream, uint32_t gpu_index, @@ -76,7 +76,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t max_shared_memory); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -85,7 +85,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t max_shared_memory); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); @@ -354,7 +354,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -364,7 +364,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t lwe_idx, uint32_t max_shared_memory); #if (CUDA_ARCH >= 900) template @@ -375,7 +375,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_tbc( diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index f0f9efabd..a80035173 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -29,8 +29,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset, - uint32_t lwe_chunk_size = 0); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); 
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, @@ -65,7 +64,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif template @@ -92,7 +91,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template void scratch_cuda_multi_bit_programmable_bootstrap( @@ -111,7 +110,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template __host__ __device__ uint64_t diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 2373e418a..524a1fa45 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -9,16 +9,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples) { cuda_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples, - gpu_offset); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts. 
@@ -41,14 +39,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples) { cuda_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples, - gpu_offset); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index a1bc861f7..db78104a9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -43,23 +43,20 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const Torus *__restrict__ ksk, uint32_t lwe_dimension_in, - uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, - int gpu_offset) { + uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x; extern __shared__ int8_t sharedmem[]; Torus *lwe_acc_out = (Torus *)sharedmem; - auto block_lwe_array_out = - get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset], - lwe_dimension_out + 1); + auto block_lwe_array_out = get_chunk( + lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1); if (tid <= lwe_dimension_out) { Torus local_lwe_out = 0; - auto block_lwe_array_in = - get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset], - lwe_dimension_in + 1); + auto block_lwe_array_in = get_chunk( + lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1); if (tid == lwe_dimension_out && threadIdx.y == 0) { local_lwe_out = block_lwe_array_in[lwe_dimension_in]; @@ -105,8 +102,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset = 0) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples) { cudaSetDevice(gpu_index); @@ -122,42 +118,40 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( keyswitch<<>>( lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk, - lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset); + lwe_dimension_in, lwe_dimension_out, base_log, level_count); check_cuda_error(cudaGetLastError()); } template -void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, Torus **ksks, - uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, - uint32_t num_samples, bool sync_streams = true) { +void execute_keyswitch_async(cudaStream_t *streams, uint32_t 
*gpu_indexes, + uint32_t gpu_count, + const LweArrayVariant &lwe_array_out, + const LweArrayVariant &lwe_output_indexes, + const LweArrayVariant &lwe_array_in, + const LweArrayVariant &lwe_input_indexes, + Torus **ksks, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, uint32_t base_log, + uint32_t level_count, uint32_t num_samples) { /// If the number of radix blocks is lower than the number of GPUs, not all /// GPUs will be active and there will be 1 input per GPU - auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count); - int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count); - if (sync_streams) - cuda_synchronize_stream(streams[0], gpu_indexes[0]); -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count); - int gpu_offset = get_gpu_offset(num_samples, i, gpu_count); + + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); // Compute Keyswitch cuda_keyswitch_lwe_ciphertext_vector( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in, - lwe_dimension_out, base_log, level_count, num_samples_on_gpu, - gpu_offset); + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, current_lwe_array_in, + current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out, + base_log, level_count, num_samples_on_gpu); } - - if (sync_streams) - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); - } } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh index bab675bf5..28b02a57f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -131,7 +131,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 43fe3b184..a1057a443 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -76,8 +76,10 @@ __host__ void host_integer_radix_cmux_kb( mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) { cuda_synchronize_stream(true_streams[j], gpu_indexes[j]); + } + for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) { cuda_synchronize_stream(false_streams[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index e1d3e8452..d56d6e329 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ 
-245,7 +245,6 @@ __host__ void host_compare_with_zero_equality( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, int32_t num_radix_blocks, int_radix_lut *zero_comparison) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 9562fdbf1..f38c59167 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -31,17 +31,13 @@ template struct lwe_ciphertext_list { int_radix_params params; size_t big_lwe_size; - size_t radix_size; size_t big_lwe_size_bytes; - size_t radix_size_bytes; size_t big_lwe_dimension; lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks) : data(src), params(params), max_blocks(max_blocks) { big_lwe_size = params.big_lwe_dimension + 1; big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - radix_size = max_blocks * big_lwe_size; - radix_size_bytes = radix_size * sizeof(Torus); big_lwe_dimension = params.big_lwe_dimension; len = max_blocks; } @@ -404,7 +400,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -514,7 +510,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -591,7 +587,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -632,7 +628,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 7e216ccff..b0e8d4e7d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -3,6 +3,7 @@ #include "crypto/keyswitch.cuh" #include "device.h" +#include "helper_multi_gpu.h" #include "integer.h" #include "integer/scalar_addition.cuh" #include "linear_algebra.h" @@ -10,6 +11,7 @@ #include "polynomial/functions.cuh" #include "programmable_bootstrap.h" #include "utils/helper.cuh" +#include "utils/helper_multi_gpu.cuh" #include 
"utils/kernel_dimensions.cuh" #include @@ -153,28 +155,69 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( auto polynomial_size = params.polynomial_size; auto grouping_factor = params.grouping_factor; - cuda_synchronize_stream(streams[0], gpu_indexes[0]); - /// Apply KS to go from a big LWE dimension to a small LWE dimension - execute_keyswitch(streams, gpu_indexes, gpu_count, - lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, - lwe_array_in, lut->lwe_indexes_in, ksks, - big_lwe_dimension, small_lwe_dimension, ks_base_log, - ks_level, num_radix_blocks, false); + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector lwe_array_in_vec = lut->lwe_array_in_vec; + std::vector lwe_after_ks_vec = lut->lwe_after_ks_vec; + std::vector lwe_after_pbs_vec = lut->lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE - /// dimension to a big LWE dimension - execute_pbs( - streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out, - lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks, - lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension, - small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false); - - /// Synchronize all GPUs auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); + if (active_gpu_count == 1) { + execute_keyswitch_async(streams, gpu_indexes, 1, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], lwe_array_in, + lut->lwe_indexes_in, ksks, big_lwe_dimension, + small_lwe_dimension, ks_base_log, ks_level, + num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out, + lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, + small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + } else { + /// Make sure all data that should be on GPU 0 is indeed there + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + /// With multiple GPUs we push to the vectors on each GPU then when we + /// gather data to GPU 0 we can copy back to the original indexing + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in, + lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks, + big_lwe_dimension + 1); + + /// Apply KS to go from a big LWE dimension to a small LWE dimension + execute_keyswitch_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, lwe_trivial_indexes_vec, + lwe_array_in_vec, lwe_trivial_indexes_vec, + ksks, big_lwe_dimension, small_lwe_dimension, + ks_base_log, ks_level, num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, + lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, 
pbs_base_log, + pbs_level, grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + + /// Copy data back to GPU 0 and release vecs + multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_out, lwe_after_pbs_vec, + lut->h_lwe_indexes_out, + lut->using_trivial_lwe_indexes, + num_radix_blocks, big_lwe_dimension + 1); + + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } } @@ -205,29 +248,65 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( num_radix_blocks); check_cuda_error(cudaGetLastError()); - cuda_synchronize_stream(streams[0], gpu_indexes[0]); + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector lwe_array_in_vec = lut->lwe_array_in_vec; + std::vector lwe_after_ks_vec = lut->lwe_after_ks_vec; + std::vector lwe_after_pbs_vec = lut->lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - /// Apply KS to go from a big LWE dimension to a small LWE dimension - execute_keyswitch(streams, gpu_indexes, gpu_count, - lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, - lwe_array_pbs_in, lut->lwe_indexes_in, ksks, - big_lwe_dimension, small_lwe_dimension, ks_base_log, - ks_level, num_radix_blocks, false); - - /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE - /// dimension to a big LWE dimension - execute_pbs( - streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out, - lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks, - lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension, - small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false); - - /// Synchronize all GPUs auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); + if (active_gpu_count == 1) { + execute_keyswitch_async(streams, gpu_indexes, 1, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], lwe_array_pbs_in, + lut->lwe_indexes_in, ksks, big_lwe_dimension, + small_lwe_dimension, ks_base_log, ks_level, + num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out, + lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, + small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + } else { + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, + lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, + num_radix_blocks, big_lwe_dimension + 1); + + /// Apply KS to go from a big LWE dimension to a small LWE dimension + execute_keyswitch_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, lwe_trivial_indexes_vec, + lwe_array_in_vec, lwe_trivial_indexes_vec, + ksks, big_lwe_dimension, small_lwe_dimension, + ks_base_log, ks_level, num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + 
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, + lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, + pbs_level, grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + + /// Copy data back to GPU 0 and release vecs + multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_out, lwe_after_pbs_vec, + lut->h_lwe_indexes_out, + lut->using_trivial_lwe_indexes, + num_radix_blocks, big_lwe_dimension + 1); + + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } } @@ -325,7 +404,6 @@ void generate_device_accumulator_bivariate( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f) { - cudaSetDevice(gpu_index); // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); @@ -357,7 +435,6 @@ void generate_device_accumulator_bivariate_with_factor( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f, int factor) { - cudaSetDevice(gpu_index); // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); @@ -595,10 +672,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, cudaSetDevice(gpu_indexes[0]); /// Since the keyswitch is done on one input only, use only 1 GPU - cuda_keyswitch_lwe_ciphertext_vector( - streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, + execute_keyswitch_async( + streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector, mem_ptr->lut->lwe_trivial_indexes, cur_input_block, - mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension, + mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension, params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1); cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size], @@ -606,7 +683,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, small_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - execute_pbs( + execute_pbs_async( streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector, mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec, mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 39465f430..6aa721b08 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -8,11 +8,13 @@ #include "crypto/keyswitch.cuh" #include "device.h" +#include "helper_multi_gpu.h" #include "integer.h" #include "integer/integer.cuh" #include "linear_algebra.h" #include "programmable_bootstrap.h" #include "utils/helper.cuh" +#include "utils/helper_multi_gpu.cuh" #include "utils/kernel_dimensions.cuh" #include #include @@ -224,11 +226,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( auto message_modulus = mem_ptr->params.message_modulus; auto carry_modulus = mem_ptr->params.carry_modulus; auto num_blocks = num_blocks_in_radix; - auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1; + auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension; + auto big_lwe_size = big_lwe_dimension + 1; auto 
glwe_dimension = mem_ptr->params.glwe_dimension; auto polynomial_size = mem_ptr->params.polynomial_size; - auto lwe_dimension = mem_ptr->params.small_lwe_dimension; - auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension; + auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension; + auto small_lwe_size = small_lwe_dimension + 1; if (old_blocks != terms) { cuda_memcpy_async_gpu_to_gpu(old_blocks, terms, @@ -313,12 +316,10 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( auto lwe_indexes_in = luts_message_carry->lwe_indexes_in; auto lwe_indexes_out = luts_message_carry->lwe_indexes_out; + luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0], + h_lwe_idx_in, h_lwe_idx_out); size_t copy_size = total_count * sizeof(Torus); - cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, - streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, - streams[0], gpu_indexes[0]); copy_size = sm_copy_count * sizeof(int32_t); cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size, streams[0], gpu_indexes[0]); @@ -338,27 +339,97 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - auto active_gpu_count = get_active_gpu_count(total_count, gpu_count); - /// Apply KS to go from a big LWE dimension to a small LWE dimension - /// After this keyswitch execution, we need to synchronize the streams - /// because the keyswitch and PBS do not operate on the same number of - /// inputs - execute_keyswitch(streams, gpu_indexes, gpu_count, small_lwe_vector, - lwe_indexes_in, new_blocks, lwe_indexes_in, ksks, - polynomial_size * glwe_dimension, lwe_dimension, - mem_ptr->params.ks_base_log, - mem_ptr->params.ks_level, message_count, true); + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector new_blocks_vec = luts_message_carry->lwe_array_in_vec; + std::vector small_lwe_vector_vec = + luts_message_carry->lwe_after_ks_vec; + std::vector lwe_after_pbs_vec = + luts_message_carry->lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec = + luts_message_carry->lwe_trivial_indexes_vec; - /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE - /// dimension to a big LWE dimension - execute_pbs(streams, gpu_indexes, gpu_count, new_blocks, - lwe_indexes_out, luts_message_carry->lut_vec, - luts_message_carry->lut_indexes_vec, small_lwe_vector, - lwe_indexes_in, bsks, luts_message_carry->buffer, - glwe_dimension, lwe_dimension, polynomial_size, - mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, 2, 0, - max_shared_memory, mem_ptr->params.pbs_type, true); + auto active_gpu_count = get_active_gpu_count(total_count, gpu_count); + if (active_gpu_count == 1) { + /// Apply KS to go from a big LWE dimension to a small LWE dimension + /// After this keyswitch execution, we need to synchronize the streams + /// because the keyswitch and PBS do not operate on the same number of + /// inputs + execute_keyswitch_async( + streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks, + lwe_indexes_in, ksks, polynomial_size * glwe_dimension, + small_lwe_dimension, mem_ptr->params.ks_base_log, + mem_ptr->params.ks_level, message_count); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, 1, new_blocks, lwe_indexes_out, + luts_message_carry->lut_vec, 
luts_message_carry->lut_indexes_vec, + small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, + mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, + mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory, + mem_ptr->params.pbs_type); + } else { + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks, + luts_message_carry->h_lwe_indexes_in, + luts_message_carry->using_trivial_lwe_indexes, message_count, + big_lwe_size); + + /// Apply KS to go from a big LWE dimension to a small LWE dimension + /// After this keyswitch execution, we need to synchronize the streams + /// because the keyswitch and PBS do not operate on the same number of + /// inputs + execute_keyswitch_async( + streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec, + lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, + ksks, big_lwe_dimension, small_lwe_dimension, + mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count); + + /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a + /// different configuration + multi_gpu_gather_lwe_async( + streams, gpu_indexes, gpu_count, small_lwe_vector, + small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in, + luts_message_carry->using_trivial_lwe_indexes, message_count, + small_lwe_size); + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, gpu_count, small_lwe_vector_vec, + small_lwe_vector, luts_message_carry->h_lwe_indexes_in, + luts_message_carry->using_trivial_lwe_indexes, total_count, + small_lwe_size); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + lwe_trivial_indexes_vec, luts_message_carry->lut_vec, + luts_message_carry->lut_indexes_vec, small_lwe_vector_vec, + lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, + mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, + mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory, + mem_ptr->params.pbs_type); + + multi_gpu_gather_lwe_async( + streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec, + luts_message_carry->h_lwe_indexes_out, + luts_message_carry->using_trivial_lwe_indexes, total_count, + big_lwe_size); + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + } int rem_blocks = (r > chunk_size) ? 
r % chunk_size * num_blocks : 0; int new_blocks_created = 2 * ch_amount * num_blocks; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 3aba53cce..a8b688239 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -133,7 +133,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -205,7 +205,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -397,7 +396,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( signed_msb_lut->params.message_modulus); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -465,7 +464,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -737,7 +736,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb( } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index cfc9a6773..e612c9ab2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -245,7 +245,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( } } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem->active_gpu_count; j++) { cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 4b5a2f1f3..cc5bd54fa 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -1,14 +1,13 @@ #ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH #define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH +#include "cooperative_groups.h" #include "device.h" #include "fft/bnsmfft.cuh" +#include "helper_multi_gpu.h" #include "programmable_bootstrap.h" #include "programmable_bootstrap_multibit.h" -#include "cooperative_groups.h" -#include "helper_multi_gpu.h" - using namespace cooperative_groups; namespace cg = cooperative_groups; @@ -118,22 
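The synchronization loops above now run over mem_ptr->active_gpu_count rather than gpu_count, so streams on devices that received no work for this batch are left alone. A minimal sketch of that idea, assuming (not shown in this patch) that the active count is capped by the number of inputs; it uses the plain CUDA runtime API instead of the project's cuda_synchronize_stream wrapper, and active_gpu_count_sketch / sync_active_streams are hypothetical names.

#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>

// Assumption: a device is "active" only if it receives at least one input,
// so the active count never exceeds the input count.
static uint32_t active_gpu_count_sketch(uint32_t num_inputs,
                                        uint32_t gpu_count) {
  return std::min(num_inputs, gpu_count);
}

// Synchronize only the streams that actually had kernels enqueued; looping up
// to gpu_count would needlessly switch to and wait on idle devices.
static void sync_active_streams(cudaStream_t *streams, const int *gpu_indexes,
                                uint32_t active_gpu_count) {
  for (uint32_t j = 0; j < active_gpu_count; ++j) {
    cudaSetDevice(gpu_indexes[j]);
    cudaStreamSynchronize(streams[j]);
  }
}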
+117,18 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer, } template -void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_output_indexes, std::vector lut_vec, - std::vector lut_indexes_vec, Torus *lwe_array_in, - Torus *lwe_input_indexes, void **bootstrapping_keys, - std::vector pbs_buffer, uint32_t glwe_dimension, - uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, PBS_TYPE pbs_type, - bool sync_streams = true) { - auto active_gpu_count = - get_active_gpu_count(input_lwe_ciphertext_count, gpu_count); - if (sync_streams) - cuda_synchronize_stream(streams[0], gpu_indexes[0]); +void execute_pbs_async( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + const LweArrayVariant &lwe_array_out, + const LweArrayVariant &lwe_output_indexes, + std::vector lut_vec, std::vector lut_indexes_vec, + const LweArrayVariant &lwe_array_in, + const LweArrayVariant &lwe_input_indexes, void **bootstrapping_keys, + std::vector pbs_buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t grouping_factor, + uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, + uint32_t max_shared_memory, PBS_TYPE pbs_type) { switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits @@ -141,20 +136,32 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, case MULTI_BIT: PANIC("Error: 32-bit multibit PBS is not supported.\n") case CLASSICAL: -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_inputs_on_gpu = get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + int gpu_offset = get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); auto d_lut_vector_indexes = lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); + + // Use the macro to get the correct elements for the current iteration + // Handles the case when the input/output are scattered through + // different gpus and when it is not + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); + cuda_programmable_bootstrap_lwe_ciphertext_vector_32( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, + current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory, gpu_offset); + lwe_idx, max_shared_memory); } break; default: @@ -168,38 +175,61 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, case MULTI_BIT: if (grouping_factor == 0) PANIC("Multi-bit PBS error: grouping factor should be > 0.") -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_inputs_on_gpu = 
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + + // Use the macro to get the correct elements for the current iteration + // Handles the case when the input/output are scattered through + // different gpus and when it is not + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); + int gpu_offset = get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); auto d_lut_vector_indexes = lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, + current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory, - gpu_offset); + num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory); } break; case CLASSICAL: -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_inputs_on_gpu = get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + + // Use the macro to get the correct elements for the current iteration + // Handles the case when the input/output are scattered through + // different gpus and when it is not + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); + int gpu_offset = get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); auto d_lut_vector_indexes = lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); + cuda_programmable_bootstrap_lwe_ciphertext_vector_64( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, + current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory, gpu_offset); + lwe_idx, max_shared_memory); } break; default: @@ -210,11 +240,6 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " "moduli are supported.") } - - if (sync_streams) - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); - } } template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index de2f01d5b..89c1d2a9c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -158,7 +158,7 @@ void 
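The variant-typed parameters of execute_pbs_async above let one entry point accept either a single device pointer (the single-GPU path of the sum) or one pointer per GPU (the scattered path). A self-contained C++17 sketch of that calling convention follows; LweArrayVariantSketch, per_gpu_ptr and launch_on_each_gpu are hypothetical names, and a printf replaces the real per-GPU bootstrap launch.

#include <cstdint>
#include <cstdio>
#include <variant>
#include <vector>

// Either one pointer shared by every iteration, or one pointer per GPU.
template <typename Torus>
using LweArrayVariantSketch = std::variant<std::vector<Torus *>, Torus *>;

// Hypothetical helper: pick the pointer for GPU i, whichever form was passed.
template <typename Torus>
Torus *per_gpu_ptr(const LweArrayVariantSketch<Torus> &v, std::size_t i) {
  if (std::holds_alternative<std::vector<Torus *>>(v))
    return std::get<std::vector<Torus *>>(v)[i];
  return std::get<Torus *>(v);
}

template <typename Torus>
void launch_on_each_gpu(const LweArrayVariantSketch<Torus> &in,
                        const LweArrayVariantSketch<Torus> &out,
                        std::size_t gpu_count) {
  for (std::size_t i = 0; i < gpu_count; ++i) {
    Torus *gpu_in = per_gpu_ptr(in, i);
    Torus *gpu_out = per_gpu_ptr(out, i);
    std::printf("gpu %zu: in=%p out=%p\n", i, (void *)gpu_in, (void *)gpu_out);
  }
}

int main() {
  uint64_t a[4] = {}, b[4] = {}, c[4] = {};
  // Single-GPU style call: plain pointers convert into the variant.
  launch_on_each_gpu<uint64_t>(a, b, 1);
  // Multi-GPU style call: one pointer per device.
  std::vector<uint64_t *> ins{a, b}, outs{c, c};
  launch_on_each_gpu<uint64_t>(ins, outs, ins.size());
  return 0;
}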
cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 32) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -172,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 512: host_programmable_bootstrap_amortized>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 1024: host_programmable_bootstrap_amortized>( @@ -190,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 2048: host_programmable_bootstrap_amortized>( @@ -199,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 4096: host_programmable_bootstrap_amortized>( @@ -208,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 8192: host_programmable_bootstrap_amortized>( @@ -217,7 +217,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 16384: host_programmable_bootstrap_amortized>( @@ -226,7 +226,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - 
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -307,7 +307,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 64) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -321,7 +321,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 512: host_programmable_bootstrap_amortized>( @@ -330,7 +330,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 1024: host_programmable_bootstrap_amortized>( @@ -339,7 +339,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 2048: host_programmable_bootstrap_amortized>( @@ -348,7 +348,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 4096: host_programmable_bootstrap_amortized>( @@ -357,7 +357,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 8192: host_programmable_bootstrap_amortized>( @@ -366,7 +366,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 16384: host_programmable_bootstrap_amortized>( @@ -375,7 +375,7 @@ void 
cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 0f1f98a2a..48536f137 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -55,7 +55,7 @@ __global__ void device_programmable_bootstrap_amortized( const double2 *__restrict__ bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, - size_t device_memory_size_per_sample, uint32_t gpu_offset) { + size_t device_memory_size_per_sample) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -82,8 +82,7 @@ __global__ void device_programmable_bootstrap_amortized( (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2); auto block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * (glwe_dimension + 1)]; @@ -201,7 +200,7 @@ __global__ void device_programmable_bootstrap_amortized( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.x] * (glwe_dimension * polynomial_size + 1)]; // The blind rotation for this block is over @@ -302,7 +301,7 @@ __host__ void host_programmable_bootstrap_amortized( int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { cudaSetDevice(gpu_index); uint64_t SM_FULL = @@ -336,14 +335,14 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_FULL, gpu_offset); + level_count, lwe_idx, DM_FULL); } else if (max_shared_memory < SM_FULL) { device_programmable_bootstrap_amortized <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_PART, gpu_offset); + level_count, lwe_idx, DM_PART); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. 
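With gpu_offset removed, each device kernel above indexes its batch locally: block b simply reads lwe_input_indexes[b] and writes through lwe_output_indexes[b]. A toy CUDA kernel illustrating that indirect-indexing pattern on plain integers, with one block per sample as in the amortized PBS; the kernel name and parameters are hypothetical.

#include <cstdint>
#include <cuda_runtime.h>

// One block per sample: each block selects its input and output rows through
// index arrays, with no cross-GPU offset added to blockIdx.x.
__global__ void copy_selected_rows(const uint64_t *in,
                                   const uint64_t *in_indexes, uint64_t *out,
                                   const uint64_t *out_indexes, int row_len) {
  const uint64_t *src = &in[in_indexes[blockIdx.x] * row_len];
  uint64_t *dst = &out[out_indexes[blockIdx.x] * row_len];
  for (int i = threadIdx.x; i < row_len; i += blockDim.x)
    dst[i] = src[i];
}

// Launch sketch: num_samples blocks, each handling one indexed row.
// copy_selected_rows<<<num_samples, 256, 0, stream>>>(d_in, d_in_idx,
//                                                     d_out, d_out_idx,
//                                                     row_len);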
Shared memory on the @@ -355,7 +354,7 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, 0, gpu_offset); + level_count, lwe_idx, 0); } check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 681ce1391..a7a14d996 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -44,7 +44,7 @@ __global__ void device_programmable_bootstrap_cg( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { grid_group grid = this_grid(); @@ -78,8 +78,7 @@ __global__ void device_programmable_bootstrap_cg( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -144,7 +143,7 @@ __global__ void device_programmable_bootstrap_cg( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -206,7 +205,7 @@ __host__ void host_programmable_bootstrap_cg( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t num_luts, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -228,7 +227,7 @@ __host__ void host_programmable_bootstrap_cg( int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); - void *kernel_args[15]; + void *kernel_args[14]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; @@ -242,7 +241,6 @@ __host__ void host_programmable_bootstrap_cg( kernel_args[10] = &base_log; kernel_args[11] = &level_count; kernel_args[12] = &d_mem; - kernel_args[14] = &gpu_offset; if (max_shared_memory < partial_sm) { kernel_args[13] = &full_dm; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 18cce4e8e..59dfd2abf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -29,7 +29,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, 
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { grid_group grid = this_grid(); @@ -58,8 +58,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -121,7 +120,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -262,8 +261,7 @@ __host__ void execute_cg_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset, - uint32_t gpu_offset) { + uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { cudaSetDevice(gpu_index); uint64_t full_dm = @@ -286,7 +284,7 @@ __host__ void execute_cg_external_product_loop( auto global_accumulator = buffer->global_accumulator; auto buffer_fft = buffer->global_accumulator_fft; - void *kernel_args[21]; + void *kernel_args[20]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; @@ -306,7 +304,6 @@ __host__ void execute_cg_external_product_loop( kernel_args[16] = &chunk_size; kernel_args[17] = &keybundle_size_per_input; kernel_args[18] = &d_mem; - kernel_args[20] = &gpu_offset; dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); dim3 thds(polynomial_size / params::opt, 1, 1); @@ -341,7 +338,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) @@ -356,7 +353,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset, gpu_offset); + lwe_chunk_size, lwe_offset); // Accumulate execute_cg_external_product_loop( @@ -364,7 +361,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset, gpu_offset); + max_shared_memory, lwe_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 8f2a59e80..2875491d3 
100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -132,7 +132,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t lwe_idx, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -141,7 +141,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 512: host_programmable_bootstrap_tbc>( @@ -149,7 +149,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 1024: host_programmable_bootstrap_tbc>( @@ -157,7 +157,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 2048: host_programmable_bootstrap_tbc>( @@ -165,7 +165,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 4096: host_programmable_bootstrap_tbc>( @@ -173,7 +173,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 8192: host_programmable_bootstrap_tbc>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 16384: host_programmable_bootstrap_tbc>( @@ -189,7 +189,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -411,7 +411,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t lwe_idx, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -420,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 512: host_programmable_bootstrap_cg>( @@ -428,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 1024: host_programmable_bootstrap_cg>( @@ -436,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 2048: host_programmable_bootstrap_cg>( @@ -444,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 4096: host_programmable_bootstrap_cg>( @@ -452,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 8192: host_programmable_bootstrap_cg>( @@ -460,7 +460,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 16384: host_programmable_bootstrap_cg>( @@ -468,7 +468,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -485,7 +485,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t lwe_idx, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -494,7 +494,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 512: host_programmable_bootstrap>( @@ -502,7 +502,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 1024: host_programmable_bootstrap>( @@ -510,7 +510,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 2048: host_programmable_bootstrap>( @@ -518,7 +518,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 4096: host_programmable_bootstrap>( @@ -526,7 +526,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 8192: host_programmable_bootstrap>( @@ -534,7 +534,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 16384: host_programmable_bootstrap>( @@ -542,7 +542,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -560,7 +560,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be > number of bits " @@ -582,7 +582,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif @@ -598,7 +598,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -611,7 +611,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -697,7 +697,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (64)"); @@ -718,7 +718,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif @@ -734,7 +734,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -747,7 +747,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -777,7 +777,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t 
gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -787,7 +787,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -810,7 +810,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -820,7 +820,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -851,7 +851,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t *lwe_output_indexes, uint64_t *lut_vector, @@ -860,7 +860,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 0f4dec5fc..241370906 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -26,7 +26,7 @@ __global__ void device_programmable_bootstrap_step_one( double2 *global_accumulator_fft, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is 
kept in L1 cache and accessing it is @@ -54,8 +54,7 @@ __global__ void device_programmable_bootstrap_step_one( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -140,7 +139,7 @@ __global__ void device_programmable_bootstrap_step_two( double2 *global_accumulator_fft, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -206,7 +205,7 @@ __global__ void device_programmable_bootstrap_step_two( if (lwe_iteration + 1 == lwe_dimension) { // Last iteration auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.x] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -331,17 +330,15 @@ __host__ void scratch_programmable_bootstrap( } template -__host__ void -execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, double2 *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, - uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, - uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, - uint64_t full_dm, uint32_t gpu_offset) { +__host__ void execute_step_one( + cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, + double2 *bootstrapping_key, Torus *global_accumulator, + double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; @@ -353,37 +350,35 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm, gpu_offset); + level_count, d_mem, full_dm); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm, gpu_offset); + level_count, d_mem, partial_dm); } else { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, 
global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0, gpu_offset); + level_count, d_mem, 0); } check_cuda_error(cudaGetLastError()); } template -__host__ void -execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, - Torus *lut_vector_indexes, double2 *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, - uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, - uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, - uint64_t full_dm, uint32_t gpu_offset) { +__host__ void execute_step_two( + cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, + double2 *bootstrapping_key, Torus *global_accumulator, + double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; @@ -395,21 +390,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm, gpu_offset); + level_count, d_mem, full_dm); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm, gpu_offset); + level_count, d_mem, partial_dm); } else { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0, gpu_offset); + level_count, d_mem, 0); } check_cuda_error(cudaGetLastError()); } @@ -424,7 +419,7 @@ __host__ void host_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t num_luts, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -455,14 +450,14 @@ __host__ void host_programmable_bootstrap( global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one, - full_dm_step_one, gpu_offset); + full_dm_step_one); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, 
max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two, - full_dm_step_two, gpu_offset); + full_dm_step_two); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index f04a3c8be..e994e203a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -75,7 +75,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -89,8 +89,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 512: host_cg_multi_bit_programmable_bootstrap 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -182,8 +175,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -191,8 +183,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 1024: host_multi_bit_programmable_bootstrap *buffer = (pbs_buffer *)mem_ptr; @@ -277,8 +262,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") #endif @@ -293,8 +277,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -306,8 +289,7 @@ 
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -575,7 +557,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( @@ -595,7 +577,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -683,7 +665,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -697,8 +679,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 512: host_tbc_multi_bit_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index b2069d2f6..4e5b5fca2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -40,8 +40,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, - int8_t *device_mem, uint64_t device_memory_size_per_block, - uint32_t gpu_offset) { + int8_t *device_mem, uint64_t device_memory_size_per_block) { extern __shared__ int8_t sharedmem[]; int8_t *selected_memory = sharedmem; @@ -66,8 +65,7 @@ __global__ void 
device_multi_bit_programmable_bootstrap_keybundle( Torus *accumulator = (Torus *)selected_memory; const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + // select the input @@ -157,7 +155,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( double2 *global_accumulator_fft, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -184,8 +182,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( accumulator_fft = (double2 *)sharedmem; const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -254,7 +251,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -318,7 +315,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two( if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) { // Last iteration auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.x] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -492,8 +489,7 @@ __host__ void execute_compute_keybundle( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset, - uint32_t gpu_offset) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset) { cudaSetDevice(gpu_index); uint32_t chunk_size = @@ -521,27 +517,26 @@ __host__ void execute_compute_keybundle( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_sm_keybundle, gpu_offset); + keybundle_size_per_input, d_mem, full_sm_keybundle); else device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, gpu_offset); + keybundle_size_per_input, d_mem, 0); check_cuda_error(cudaGetLastError()); } template -__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, - 
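In the keybundle and accumulate kernels above, lwe_input_indexes[blockIdx.z + gpu_offset] becomes lwe_input_indexes[blockIdx.z]. The toy kernel below, written only to illustrate that indexing contract (none of these names exist in the backend), shows what the change assumes: the index array handed to each GPU is already local to that GPU's slice, so blockIdx.z addresses it directly.

// Toy CUDA program: zero-based per-GPU indexing after the host has sliced
// the batch. Purely an illustration, not backend code.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void copy_first_coefficient(const uint64_t *lwe_array_in,
                                       const uint64_t *lwe_input_indexes,
                                       uint64_t *first_coeffs,
                                       uint32_t lwe_size) {
  // blockIdx.z selects a ciphertext inside this GPU's local batch; no
  // gpu_offset is added because the slicing already happened on the host.
  const uint64_t *ct = &lwe_array_in[lwe_input_indexes[blockIdx.z] * lwe_size];
  if (threadIdx.x == 0)
    first_coeffs[blockIdx.z] = ct[0];
}

int main() {
  const uint32_t num_inputs = 4, lwe_size = 3;
  uint64_t h_in[num_inputs * lwe_size], h_idx[num_inputs], h_out[num_inputs];
  for (uint32_t i = 0; i < num_inputs; i++) {
    h_idx[i] = i; // trivial (identity) indexing
    for (uint32_t j = 0; j < lwe_size; j++)
      h_in[i * lwe_size + j] = 100 * i + j;
  }
  uint64_t *d_in, *d_idx, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_idx, sizeof(h_idx));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_idx, h_idx, sizeof(h_idx), cudaMemcpyHostToDevice);
  dim3 grid(1, 1, num_inputs); // one block along z per local ciphertext
  copy_first_coefficient<<<grid, 32>>>(d_in, d_idx, d_out, lwe_size);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (uint32_t i = 0; i < num_inputs; i++)
    std::printf("input %u first coeff = %llu\n", i, (unsigned long long)h_out[i]);
  cudaFree(d_in); cudaFree(d_idx); cudaFree(d_out);
}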
Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, - pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t max_shared_memory, - int j, int lwe_offset, uint32_t gpu_offset) { +__host__ void +execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + uint32_t max_shared_memory, int j, int lwe_offset) { cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_one = @@ -566,7 +561,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, - j + lwe_offset, d_mem, full_sm_accumulate_step_one, gpu_offset); + j + lwe_offset, d_mem, full_sm_accumulate_step_one); else if (max_shared_memory < full_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one @@ -575,7 +570,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, j + lwe_offset, - d_mem, partial_sm_accumulate_step_one, gpu_offset); + d_mem, partial_sm_accumulate_step_one); else device_multi_bit_programmable_bootstrap_accumulate_step_one @@ -584,7 +579,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, j + lwe_offset, - d_mem, 0, gpu_offset); + d_mem, 0); check_cuda_error(cudaGetLastError()); } @@ -596,7 +591,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, uint32_t max_shared_memory, int j, - int lwe_offset, uint32_t lwe_chunk_size, uint32_t gpu_offset) { + int lwe_offset, uint32_t lwe_chunk_size) { cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_two = @@ -618,8 +613,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, lwe_array_out, lwe_output_indexes, keybundle_fft, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, j, - lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two, - gpu_offset); + lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two); else device_multi_bit_programmable_bootstrap_accumulate_step_two @@ -627,8 +621,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0, - gpu_offset); + grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0); check_cuda_error(cudaGetLastError()); } @@ -641,7 +634,7 @@ __host__ void host_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t 
base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); // If a chunk size is not passed to this function, select one. @@ -657,7 +650,7 @@ __host__ void host_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset, gpu_offset); + lwe_chunk_size, lwe_offset); // Accumulate uint32_t chunk_size = std::min( lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -666,13 +659,13 @@ __host__ void host_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, max_shared_memory, j, - lwe_offset, gpu_offset); + lwe_offset); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, max_shared_memory, j, lwe_offset, - lwe_chunk_size, gpu_offset); + lwe_chunk_size); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index ff379bb91..ff5651806 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -44,8 +44,7 @@ __global__ void device_programmable_bootstrap_tbc( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm, - uint32_t gpu_offset) { + uint64_t device_memory_size_per_block, bool support_dsm) { cluster_group cluster = this_cluster(); @@ -82,8 +81,7 @@ __global__ void device_programmable_bootstrap_tbc( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -148,7 +146,7 @@ __global__ void device_programmable_bootstrap_tbc( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -229,7 +227,7 @@ __host__ void host_programmable_bootstrap_tbc( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t num_luts, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); auto supports_dsm = @@ -283,7 +281,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, 
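host_multi_bit_programmable_bootstrap keeps its chunked structure, just without the offset argument: for each slice of lwe_chunk_size mask elements it builds the keybundle, then runs accumulate step one and step two for every position in the chunk. A bare skeleton of that control flow, with the three steps stubbed out purely for illustration, looks like this:

// Skeleton only: mirrors the chunked loop above, with stub functions in
// place of execute_compute_keybundle / execute_step_one / execute_step_two.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void compute_keybundle(uint32_t lwe_offset) {
  std::printf("keybundle for mask elements starting at %u\n", lwe_offset);
}
static void step_one(uint32_t it) { std::printf("  step one, iteration %u\n", it); }
static void step_two(uint32_t it) { std::printf("  step two, iteration %u\n", it); }

int main() {
  const uint32_t lwe_dimension = 16, grouping_factor = 2, lwe_chunk_size = 3;
  // One keybundle per chunk, then one accumulate pass per chunk element.
  for (uint32_t lwe_offset = 0; lwe_offset < lwe_dimension / grouping_factor;
       lwe_offset += lwe_chunk_size) {
    compute_keybundle(lwe_offset);
    uint32_t chunk = std::min(lwe_chunk_size,
                              lwe_dimension / grouping_factor - lwe_offset);
    for (uint32_t j = 0; j < chunk; j++) {
      step_one(j + lwe_offset);
      step_two(j + lwe_offset);
    }
  }
}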
base_log, level_count, d_mem, full_dm, - supports_dsm, gpu_offset)); + supports_dsm)); } else if (max_shared_memory < full_sm + minimum_sm_tbc) { config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; @@ -292,7 +290,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, - partial_dm, supports_dsm, gpu_offset)); + partial_dm, supports_dsm)); } else { config.dynamicSmemBytes = full_sm + minimum_sm_tbc; @@ -301,7 +299,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, - supports_dsm, gpu_offset)); + supports_dsm)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 23f74a2d6..0aa711543 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -29,8 +29,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm, - uint32_t gpu_offset) { + uint64_t device_memory_size_per_block, bool support_dsm) { cluster_group cluster = this_cluster(); @@ -66,8 +65,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -129,7 +127,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -273,8 +271,7 @@ __host__ void execute_tbc_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset, - uint32_t gpu_offset) { + uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { cudaSetDevice(gpu_index); auto supports_dsm = @@ -334,7 +331,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_dm, supports_dsm, gpu_offset)); + keybundle_size_per_input, d_mem, full_dm, supports_dsm)); } else if (max_shared_memory < full_dm + 
minimum_dm) { config.dynamicSmemBytes = partial_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( @@ -345,7 +342,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, partial_dm, supports_dsm, gpu_offset)); + keybundle_size_per_input, d_mem, partial_dm, supports_dsm)); } else { config.dynamicSmemBytes = full_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( @@ -356,7 +353,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, supports_dsm, gpu_offset)); + keybundle_size_per_input, d_mem, 0, supports_dsm)); } } @@ -369,7 +366,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) @@ -384,7 +381,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset, gpu_offset); + lwe_chunk_size, lwe_offset); // Accumulate execute_tbc_external_product_loop( @@ -392,7 +389,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset, gpu_offset); + max_shared_memory, lwe_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu index 8e8189e70..a6d6cdd54 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu @@ -1,5 +1,5 @@ #include "device.h" -#include "helper_multi_gpu.h" +#include "helper_multi_gpu.cuh" #include #include @@ -21,34 +21,14 @@ int cuda_setup_multi_gpu() { check_cuda_error( cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0)); if (has_peer_access_to_device_0) { - cudaMemPool_t mempool; - cudaMemAccessDesc desc = {}; - // Enable P2P Access and mempool access check_cuda_error(cudaSetDevice(i)); check_cuda_error(cudaDeviceEnablePeerAccess(0, 0)); - - check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, 0)); - desc.location.type = cudaMemLocationTypeDevice; - desc.location.id = i; - desc.flags = cudaMemAccessFlagsProtReadWrite; - check_cuda_error( - cudaMemPoolSetAccess(mempool, &desc, 1 /* numDescs */)); - num_used_gpus += 1; - } else { - break; } + num_used_gpus += 1; } } else { - int has_peer_access_to_device_0; - for (int i = 1; i < num_gpus; i++) { - check_cuda_error( - cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0)); - if (has_peer_access_to_device_0) { - num_used_gpus += 1; - } else { - break; - } - } + for (int i = 1; i < num_gpus; i++) + num_used_gpus += 
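The simplified cuda_setup_multi_gpu above now only probes peer access towards device 0, enables it where available, and counts the devices; the explicit memory-pool access descriptors are gone. A standalone probe using the same runtime calls (cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess) can be written as below; this is a diagnostic sketch, not the backend function itself.

// Diagnostic sketch: report and enable P2P access from every GPU to GPU 0.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int num_gpus = 0;
  cudaGetDeviceCount(&num_gpus);
  std::printf("%d GPU(s) visible\n", num_gpus);
  for (int i = 1; i < num_gpus; i++) {
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, i, 0);
    if (can_access) {
      cudaSetDevice(i);
      // The second argument is a reserved flags field and must be 0.
      cudaError_t err = cudaDeviceEnablePeerAccess(0, 0);
      std::printf("GPU %d -> GPU 0 peer access: %s\n", i,
                  err == cudaSuccess ? "enabled" : cudaGetErrorString(err));
    } else {
      std::printf("GPU %d cannot access GPU 0 directly\n", i);
    }
  }
  return 0;
}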
1; } m.unlock(); } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh new file mode 100644 index 000000000..08cb9dfe9 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh @@ -0,0 +1,133 @@ +#ifndef HELPER_MULTI_GPU_CUH +#define HELPER_MULTI_GPU_CUH + +#include "helper_multi_gpu.h" + +/// Initialize same-size arrays on all active gpus +template +void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + uint32_t elements_per_gpu) { + + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + Torus *d_array = (Torus *)cuda_malloc_async( + elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i]); + dest[i] = d_array; + } +} +/// Copy an array residing on one GPU to all active gpus +template +void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + Torus *src, uint32_t elements_per_gpu) { + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus), + streams[i], gpu_indexes[i]); + } +} +/// Allocates the input/output vector for all devices +/// Initializes also the related indexing and initializes it to the trivial +/// index +template +void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + uint32_t num_inputs, uint32_t lwe_size) { + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + Torus *d_array = (Torus *)cuda_malloc_async( + inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i]); + dest[i] = d_array; + } +} + +/// Load an array residing on one GPU to all active gpus +/// and split the array among them. +/// The input indexing logic is given by an index array. 
+/// The output indexing is always the trivial one +template +void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + Torus *src, Torus *h_src_indexes, + bool is_trivial_index, uint32_t num_inputs, + uint32_t lwe_size) { + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + auto gpu_offset = 0; + for (uint j = 0; j < i; j++) { + gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count); + } + + if (is_trivial_index) { + auto d_dest = dest[i]; + auto d_src = src + gpu_offset * lwe_size; + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, + inputs_on_gpu * lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + + } else { + auto src_indexes = h_src_indexes + gpu_offset; + + for (uint j = 0; j < inputs_on_gpu; j++) { + auto d_dest = dest[i] + j * lwe_size; + auto d_src = src + src_indexes[j] * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } + } + } +} + +/// Copy data from multiple GPUs back to GPU 0 following the indexing given in +/// dest_indexes +/// The input indexing should be the trivial one +template +void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, Torus *dest, + const std::vector &src, + Torus *h_dest_indexes, bool is_trivial_index, + uint32_t num_inputs, uint32_t lwe_size) { + + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + auto gpu_offset = 0; + for (uint j = 0; j < i; j++) { + gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count); + } + + if (is_trivial_index) { + auto d_dest = dest + gpu_offset * lwe_size; + auto d_src = src[i]; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, + inputs_on_gpu * lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } else { + auto dest_indexes = h_dest_indexes + gpu_offset; + + for (uint j = 0; j < inputs_on_gpu; j++) { + auto d_dest = dest + dest_indexes[j] * lwe_size; + auto d_src = src[i] + j * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } + } + } +} + +template +void multi_gpu_release_async(cudaStream_t *streams, uint32_t *gpu_indexes, + std::vector &vec) { + + for (uint i = 0; i < vec.size(); i++) + cuda_drop_async(vec[i], streams[i], gpu_indexes[i]); +} + +#endif diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index e36e3c94d..322be12f2 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -340,7 +340,6 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - gpu_offset: u32, ); /// Perform the negation of a u64 input LWE ciphertext vector. 
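Taken together, the new helpers support a scatter/compute/gather pattern for an LWE batch that starts out on GPU 0. The fragment below is a usage sketch only: round_trip_example is a made-up wrapper, it assumes the backend headers (helper_multi_gpu.cuh and the cuda_* stream utilities it relies on) are on the include path, and it elides the actual per-GPU computation.

// Usage sketch (not from the patch): distribute a batch living on GPU 0,
// let each GPU work on its local slice, then collect the results.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "helper_multi_gpu.cuh"

void round_trip_example(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, uint64_t *d_lwe_on_gpu_0,
                        uint32_t num_inputs, uint32_t lwe_size) {
  std::vector<uint64_t *> lwe_slices;

  // One buffer per GPU, sized via get_num_inputs_on_gpu inside the helper.
  multi_gpu_alloc_lwe_async(streams, gpu_indexes, gpu_count, lwe_slices,
                            num_inputs, lwe_size);

  // Trivial (identity) input indexing: contiguous chunks go to each GPU,
  // so no host-side index array is needed.
  multi_gpu_scatter_lwe_async(streams, gpu_indexes, gpu_count, lwe_slices,
                              d_lwe_on_gpu_0, (uint64_t *)nullptr,
                              /*is_trivial_index=*/true, num_inputs, lwe_size);

  // ... launch per-GPU kernels on lwe_slices[i] with zero-based indexes ...

  // Copy every slice back into the GPU 0 buffer, again with trivial indexing.
  multi_gpu_gather_lwe_async(streams, gpu_indexes, gpu_count, d_lwe_on_gpu_0,
                             lwe_slices, (uint64_t *)nullptr,
                             /*is_trivial_index=*/true, num_inputs, lwe_size);

  multi_gpu_release_async(streams, gpu_indexes, lwe_slices);
}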
diff --git a/ci/ec2_products_cost.json b/ci/ec2_products_cost.json index 46b9c8328..3e9de534c 100644 --- a/ci/ec2_products_cost.json +++ b/ci/ec2_products_cost.json @@ -6,5 +6,6 @@ "p4d.24xlarge": 32.7726, "p5.48xlarge": 98.32, "rtx4090": 0.04, - "n3-H100x1": 4.30 + "n3-H100x1": 4.30, + "n3-H100x8-NVLink": 22.6 } diff --git a/ci/slab.toml b/ci/slab.toml index bbf2f11dc..4d5203d43 100644 --- a/ci/slab.toml +++ b/ci/slab.toml @@ -46,7 +46,7 @@ flavor_name = "n3-H100x1" [backend.hyperstack.multi-h100-nvlink] environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" -flavor_name = "n3-H100x8-NVLink-K8s" +flavor_name = "n3-H100x8-NVLink" [backend.hyperstack.multi-a100-nvlink] environment_name = "canada" diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 8eb516f2a..5a54d2a56 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -257,7 +257,6 @@ pub unsafe fn keyswitch_async( base_log.0 as u32, l_gadget.0 as u32, num_samples, - 0, ); }