diff --git a/.github/workflows/aws_tfhe_multi_gpu_tests.yml b/.github/workflows/aws_tfhe_multi_gpu_tests.yml new file mode 100644 index 000000000..87d3fa341 --- /dev/null +++ b/.github/workflows/aws_tfhe_multi_gpu_tests.yml @@ -0,0 +1,134 @@ +# Compile and test tfhe-cuda-backend on an AWS instance +name: TFHE Cuda Backend - Full tests multi-GPU + +env: + CARGO_TERM_COLOR: always + ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + RUSTFLAGS: "-C target-cpu=native" + RUST_BACKTRACE: "full" + RUST_MIN_STACK: "8388608" + SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }} + SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png + SLACK_USERNAME: ${{ secrets.BOT_USERNAME }} + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + +on: + # Allows you to run this workflow manually from the Actions tab as an alternative. + workflow_dispatch: + pull_request: + +jobs: + setup-instance: + name: Setup instance (cuda-tests-multi-gpu) + runs-on: ubuntu-latest + outputs: + runner-name: ${{ steps.start-instance.outputs.label }} + steps: + - name: Start instance + id: start-instance + uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d + with: + mode: start + github-token: ${{ secrets.SLAB_ACTION_TOKEN }} + slab-url: ${{ secrets.SLAB_BASE_URL }} + job-secret: ${{ secrets.JOB_SECRET }} + backend: aws + profile: multi-gpu-test + + cuda-tests-linux: + name: CUDA multi-GPU tests + needs: [ setup-instance ] + concurrency: + group: ${{ github.workflow }}_${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + runs-on: ${{ needs.setup-instance.outputs.runner-name }} + strategy: + fail-fast: false + # explicit include-based build matrix, of known valid options + matrix: + include: + - os: ubuntu-22.04 + cuda: "12.2" + gcc: 9 + env: + CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} + + steps: + - name: Checkout tfhe-rs + uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b + + - name: Set up home + run: | + echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}" + + - name: Install latest stable + uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a + with: + toolchain: stable + + - name: Export CUDA variables + if: ${{ !cancelled() }} + run: | + echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}" + echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}" + echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}" + echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}" + + # Specify the correct host compilers + - name: Export gcc and g++ variables + if: ${{ !cancelled() }} + run: | + { + echo "CC=/usr/bin/gcc-${{ matrix.gcc }}"; + echo "CXX=/usr/bin/g++-${{ matrix.gcc }}"; + echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}"; + echo "HOME=/home/ubuntu"; + } >> "${GITHUB_ENV}" + + - name: Run core crypto, integer and internal CUDA backend tests + run: | + make test_gpu + + - name: Run user docs tests + run: | + make test_user_doc_gpu + + - name: Test C API + run: | + make test_c_api_gpu + + - name: Run High Level API Tests + run: | + make test_high_level_api_gpu + + - name: Slack Notification + if: ${{ always() }} + continue-on-error: true + uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 + env: + SLACK_COLOR: ${{ job.status }} + SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. 
(${{ env.ACTION_RUN_URL }})" + + teardown-instance: + name: Teardown instance (cuda-tests-multi-gpu) + if: ${{ always() && needs.setup-instance.result != 'skipped' }} + needs: [ setup-instance, cuda-tests-linux ] + runs-on: ubuntu-latest + steps: + - name: Stop instance + id: stop-instance + uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d + with: + mode: stop + github-token: ${{ secrets.SLAB_ACTION_TOKEN }} + slab-url: ${{ secrets.SLAB_BASE_URL }} + job-secret: ${{ secrets.JOB_SECRET }} + label: ${{ needs.setup-instance.outputs.runner-name }} + + - name: Slack Notification + if: ${{ failure() }} + continue-on-error: true + uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 + env: + SLACK_COLOR: ${{ job.status }} + SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" diff --git a/.github/workflows/integer_multi_gpu_full_benchmark.yml b/.github/workflows/integer_multi_gpu_full_benchmark.yml new file mode 100644 index 000000000..3dfa61f72 --- /dev/null +++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml @@ -0,0 +1,184 @@ +# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot. +name: Integer multi GPU full benchmarks + +on: + workflow_dispatch: + schedule: + # Weekly benchmarks will be triggered each Saturday at 1a.m. + - cron: '0 1 * * 6' + +env: + CARGO_TERM_COLOR: always + RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json + ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + RUST_BACKTRACE: "full" + RUST_MIN_STACK: "8388608" + SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }} + SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png + SLACK_USERNAME: ${{ secrets.BOT_USERNAME }} + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + +jobs: + setup-instance: + name: Setup instance (cuda-integer-full-multi-gpu-benchmarks) + runs-on: ubuntu-latest + if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }} + outputs: + runner-name: ${{ steps.start-instance.outputs.label }} + steps: + - name: Start instance + id: start-instance + uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d + with: + mode: start + github-token: ${{ secrets.SLAB_ACTION_TOKEN }} + slab-url: ${{ secrets.SLAB_BASE_URL }} + job-secret: ${{ secrets.JOB_SECRET }} + backend: aws + profile: multi-gpu-test + + cuda-integer-full-multi-gpu-benchmarks: + name: Execute multi GPU integer benchmarks for all operations flavor + needs: setup-instance + runs-on: ${{ needs.setup-instance.outputs.runner-name }} + timeout-minutes: 1440 # 24 hours + continue-on-error: true + strategy: + fail-fast: false + max-parallel: 1 + matrix: + command: [integer, integer_multi_bit] + op_flavor: [default, unchecked] + # explicit include-based build matrix, of known valid options + include: + - os: ubuntu-22.04 + cuda: "12.2" + gcc: 9 + env: + CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} + + steps: + - name: Checkout tfhe-rs repo with tags + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 + with: + fetch-depth: 0 + + - name: Get benchmark details + run: | + { + echo "BENCH_DATE=$(date --iso-8601=seconds)"; + echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"; + echo "COMMIT_HASH=$(git describe --tags --dirty)"; + } >> "${GITHUB_ENV}" + + - name: Set 
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p3.8xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMAC of results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
+          SLACK_MESSAGE: "Integer multi-GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. 
(${{ env.ACTION_RUN_URL }})" + + teardown-instance: + name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks) + if: ${{ always() && needs.setup-instance.result != 'skipped' }} + needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ] + runs-on: ubuntu-latest + steps: + - name: Stop instance + id: stop-instance + uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d + with: + mode: stop + github-token: ${{ secrets.SLAB_ACTION_TOKEN }} + slab-url: ${{ secrets.SLAB_BASE_URL }} + job-secret: ${{ secrets.JOB_SECRET }} + label: ${{ needs.setup-instance.outputs.runner-name }} + + - name: Slack Notification + if: ${{ failure() }} + continue-on-error: true + uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 + env: + SLACK_COLOR: ${{ job.status }} + SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" diff --git a/Makefile b/Makefile index 15a154852..016614862 100644 --- a/Makefile +++ b/Makefile @@ -459,7 +459,7 @@ test_core_crypto_gpu: install_rs_build_toolchain .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend test_integer_gpu: install_rs_build_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \ - --features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: + --features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6 RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \ --features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index 69a35f7a7..e5d11b2bf 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -6,6 +6,7 @@ #include #include #include +#include #define synchronize_threads_in_block() __syncthreads() extern "C" { diff --git a/backends/tfhe-cuda-backend/cuda/include/helper.h b/backends/tfhe-cuda-backend/cuda/include/helper.h deleted file mode 100644 index 67cab1846..000000000 --- a/backends/tfhe-cuda-backend/cuda/include/helper.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HELPER_H -#define HELPER_H - -extern "C" { -int cuda_setup_multi_gpu(); -} - -void multi_gpu_checks(uint32_t gpu_count); - -#endif diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h new file mode 100644 index 000000000..e928263b9 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h @@ -0,0 +1,18 @@ +#ifndef HELPER_MULTI_GPU_H +#define HELPER_MULTI_GPU_H +#include + +extern std::mutex m; +extern bool p2p_enabled; + +extern "C" { +int cuda_setup_multi_gpu(); +} + +int get_active_gpu_count(int num_inputs, int gpu_count); + +int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count); + +int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count); + +#endif diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 5ee704fc5..3724c53ca 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -40,19 +40,22 @@ enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; 
extern "C" { void scratch_cuda_apply_univariate_lut_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, void *input_lut, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, - void *ksk, void *bsk, uint32_t num_blocks); + void **ksks, void **bsks, + uint32_t num_blocks); -void cleanup_cuda_apply_univariate_lut_kb_64(void *stream, uint32_t gpu_index, +void cleanup_cuda_apply_univariate_lut_kb_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_full_propagation_64( @@ -64,7 +67,7 @@ void scratch_cuda_full_propagation_64( void cuda_full_propagation_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *input_blocks, int8_t *mem_ptr, void *ksk, void *bsk, + void *input_blocks, int8_t *mem_ptr, void **ksks, void **bsks, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level, uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_blocks); @@ -73,7 +76,7 @@ void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index, int8_t **mem_ptr_void); void scratch_cuda_integer_mult_radix_ciphertext_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level, @@ -82,11 +85,12 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( void cuda_integer_mult_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, void *bsk, - void *ksk, int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks); + void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, + void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size, + uint32_t num_blocks); -void cleanup_cuda_integer_mult(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void); +void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void cuda_negate_integer_radix_ciphertext_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, @@ -99,226 +103,239 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace( uint32_t message_modulus, uint32_t carry_modulus); void scratch_cuda_integer_radix_logical_scalar_shift_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t 
grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory); void cuda_integer_radix_logical_scalar_shift_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks); + uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks, + uint32_t num_blocks); void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory); void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks); + uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks, + uint32_t num_blocks); -void cleanup_cuda_integer_radix_logical_scalar_shift(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); -void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_integer_radix_shift_and_rotate_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, - bool allocate_gpu_memory); + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + bool 
is_signed, bool allocate_gpu_memory); void cuda_integer_radix_shift_and_rotate_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - void *lwe_shift, int8_t *mem_ptr, void *bsk, void *ksk, + void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks); -void cleanup_cuda_integer_radix_shift_and_rotate(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_shift_and_rotate(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_integer_radix_comparison_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t lwe_ciphertext_count, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type, - bool is_signed, bool allocate_gpu_memory); + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory); void cuda_comparison_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr, - void *bsk, void *ksk, uint32_t lwe_ciphertext_count); + void **bsks, void **ksks, uint32_t lwe_ciphertext_count); void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_in, void *scalar_blocks, - int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count, + int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks); -void cleanup_cuda_integer_comparison(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void); +void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_integer_radix_bitop_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t lwe_ciphertext_count, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type, - bool allocate_gpu_memory); + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + BITOP_TYPE op_type, bool allocate_gpu_memory); void cuda_bitop_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr, - void *bsk, void *ksk, uint32_t 
lwe_ciphertext_count); + void **bsks, void **ksks, uint32_t lwe_ciphertext_count); void cuda_bitnot_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void *bsk, - void *ksk, uint32_t lwe_ciphertext_count); + void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void **bsks, + void **ksks, uint32_t lwe_ciphertext_count); void cuda_scalar_bitop_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_input, void *clear_blocks, - uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk, void *ksk, + uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op); -void cleanup_cuda_integer_bitop(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void); +void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_integer_radix_cmux_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t lwe_ciphertext_count, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory); void cuda_cmux_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_condition, void *lwe_array_true, - void *lwe_array_false, int8_t *mem_ptr, void *bsk, void *ksk, + void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count); -void cleanup_cuda_integer_radix_cmux(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void); +void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_integer_radix_scalar_rotate_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory); void cuda_integer_radix_scalar_rotate_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t n, int8_t *mem_ptr, void *bsk, void *ksk, 
uint32_t num_blocks); + uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks); -void cleanup_cuda_integer_radix_scalar_rotate(void *stream, uint32_t gpu_index, +void cleanup_cuda_integer_radix_scalar_rotate(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_propagate_single_carry_kb_64_inplace( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_propagate_single_carry_kb_64_inplace(void **streams, - uint32_t *gpu_indexes, - uint32_t gpu_count, - void *lwe_array, void *carry_out, - int8_t *mem_ptr, void *bsk, - void *ksk, uint32_t num_blocks); - -void cleanup_cuda_propagate_single_carry(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void); - -void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks_in_radix, - uint32_t max_num_radix_in_vec, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_integer_radix_sum_ciphertexts_vec_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks_in_radix); - -void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void *stream, - uint32_t gpu_index, - int8_t **mem_ptr_void); - -void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_integer_radix_overflowing_sub_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left, - void *radix_lwe_right, int8_t *mem_ptr, void *bsk, void *ksk, - uint32_t num_blocks_in_radix); - -void cleanup_cuda_integer_radix_overflowing_sub(void *stream, - uint32_t gpu_index, - int8_t **mem_ptr_void); - -void scratch_cuda_integer_scalar_mul_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); +void cuda_propagate_single_carry_kb_64_inplace( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, + void *carry_out, int8_t 
*mem_ptr, void **bsks, void **ksks, + uint32_t num_blocks); + +void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, + uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, + uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory); + +void cuda_integer_radix_sum_ciphertexts_vec_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, + int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix); + +void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_radix_overflowing_sub_kb_64( + void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); + +void cuda_integer_radix_overflowing_sub_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left, + void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks, + uint32_t num_blocks_in_radix); + +void cleanup_cuda_integer_radix_overflowing_sub(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_scalar_mul_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, + uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, + uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory); + void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, - int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_dimension, + int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars); -void cleanup_cuda_integer_radix_scalar_mul(void *stream, uint32_t gpu_index, +void cleanup_cuda_integer_radix_scalar_mul(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory); + void 
**streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); void cuda_integer_div_rem_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient, - void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, void *bsk, - void *ksk, uint32_t num_blocks_in_radix); + void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, + void **bsks, void **ksks, uint32_t num_blocks_in_radix); -void cleanup_cuda_integer_div_rem(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void); +void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void); } // extern C @@ -336,7 +353,7 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, size_t &carry_count, size_t &sm_copy_count); /* * generate bivariate accumulator (lut) for device pointer - * v_stream - cuda stream + * stream - cuda stream * acc_bivariate - device pointer for bivariate accumulator * ... * f - wrapping function with two Torus inputs @@ -354,7 +371,7 @@ void generate_device_accumulator_bivariate_with_factor( uint32_t carry_modulus, std::function f, int factor); /* * generate univariate accumulator (lut) for device pointer - * v_stream - cuda stream + * stream - cuda stream * acc - device pointer for univariate accumulator * ... * f - evaluating function with one Torus input @@ -414,93 +431,110 @@ template struct int_radix_lut { uint32_t num_blocks; bool mem_reuse = false; - int8_t *buffer; + // There will be one buffer on each GPU in multi-GPU computations + // (same for tmp lwe arrays) + std::vector buffer; + // These arrays will all reside on GPU 0 + // lut could actually be allocated & initialized GPU per GPU but this is not + // done at the moment + Torus *lut = nullptr; Torus *lut_indexes; + // All tmp lwe arrays and index arrays for lwe contain the total + // amount of blocks to be computed on, there is no split between GPUs + // for the moment Torus *lwe_indexes_in; Torus *lwe_indexes_out; - // lwe_trivial_indexes is the intermediary index we need in case // lwe_indexes_in != lwe_indexes_out Torus *lwe_trivial_indexes; - Torus *tmp_lwe_before_ks; Torus *tmp_lwe_after_ks; - Torus *lut = nullptr; - - int_radix_lut(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_luts, + int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; this->num_blocks = num_radix_blocks; Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - Torus small_size = - (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); /////////////// - execute_scratch_pbs( - stream, gpu_index, &buffer, params.glwe_dimension, - params.small_lwe_dimension, params.polynomial_size, params.pbs_level, - params.grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_index), params.pbs_type, - allocate_gpu_memory); + auto 
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + for (uint i = 0; i < active_gpu_count; i++) { + cudaSetDevice(i); + int8_t *gpu_pbs_buffer; + auto num_blocks_on_gpu = + get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count); + + execute_scratch_pbs( + streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, + params.small_lwe_dimension, params.polynomial_size, params.pbs_level, + params.grouping_factor, num_blocks_on_gpu, + cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type, + allocate_gpu_memory); + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + buffer.push_back(gpu_pbs_buffer); + } if (allocate_gpu_memory) { // Allocate LUT // LUT is used as a trivial encryption and must be initialized outside // this constructor - lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, stream, - gpu_index); + lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, streams[0], + gpu_indexes[0]); - lut_indexes = - (Torus *)cuda_malloc_async(lut_indexes_size, stream, gpu_index); + lut_indexes = (Torus *)cuda_malloc_async(lut_indexes_size, streams[0], + gpu_indexes[0]); // lut_indexes is initialized to 0 by default // if a different behavior is wanted, it should be rewritten later - cuda_memset_async(lut_indexes, 0, lut_indexes_size, stream, gpu_index); + cuda_memset_async(lut_indexes, 0, lut_indexes_size, streams[0], + gpu_indexes[0]); // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) // by default - lwe_indexes_in = - (Torus *)cuda_malloc(num_radix_blocks * sizeof(Torus), gpu_index); - lwe_indexes_out = - (Torus *)cuda_malloc(num_radix_blocks * sizeof(Torus), gpu_index); - lwe_trivial_indexes = - (Torus *)cuda_malloc(num_radix_blocks * sizeof(Torus), gpu_index); + lwe_indexes_in = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + lwe_indexes_out = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + lwe_trivial_indexes = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); for (int i = 0; i < num_radix_blocks; i++) h_lwe_indexes[i] = i; cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes, - num_radix_blocks * sizeof(Torus), stream, - gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes, - num_radix_blocks * sizeof(Torus), stream, - gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes, - num_radix_blocks * sizeof(Torus), stream, - gpu_index); - cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback, - h_lwe_indexes); + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_stream_add_callback(streams[0], gpu_indexes[0], + host_free_on_stream_callback, h_lwe_indexes); // Keyswitch + Torus big_size = + (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); + Torus small_size = + (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); tmp_lwe_before_ks = - (Torus *)cuda_malloc_async(big_size, stream, gpu_index); + (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); tmp_lwe_after_ks = - (Torus *)cuda_malloc_async(small_size, stream, gpu_index); + (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]); } } // constructor to 
reuse memory - int_radix_lut(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_luts, + int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, int_radix_lut *base_lut_object) { this->params = params; @@ -522,40 +556,41 @@ template struct int_radix_lut { // Allocate LUT // LUT is used as a trivial encryption and must be initialized outside // this constructor - lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, stream, - gpu_index); + lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, streams[0], + gpu_indexes[0]); - lut_indexes = - (Torus *)cuda_malloc_async(lut_indexes_size, stream, gpu_index); + lut_indexes = (Torus *)cuda_malloc_async(lut_indexes_size, streams[0], + gpu_indexes[0]); // lut_indexes is initialized to 0 by default // if a different behavior is wanted, it should be rewritten later - cuda_memset_async(lut_indexes, 0, lut_indexes_size, stream, gpu_index); + cuda_memset_async(lut_indexes, 0, lut_indexes_size, streams[0], + gpu_indexes[0]); // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) // by default lwe_indexes_in = (Torus *)cuda_malloc_async( - num_radix_blocks * sizeof(Torus), stream, gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); lwe_indexes_out = (Torus *)cuda_malloc_async( - num_radix_blocks * sizeof(Torus), stream, gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); lwe_trivial_indexes = (Torus *)cuda_malloc_async( - num_radix_blocks * sizeof(Torus), stream, gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); for (int i = 0; i < num_radix_blocks; i++) h_lwe_indexes[i] = i; cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes, - num_radix_blocks * sizeof(Torus), stream, - gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes, - num_radix_blocks * sizeof(Torus), stream, - gpu_index); + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes, - num_radix_blocks * sizeof(Torus), stream, - gpu_index); - cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback, - h_lwe_indexes); + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_stream_add_callback(streams[0], gpu_indexes[0], + host_free_on_stream_callback, h_lwe_indexes); } Torus *get_lut(size_t ind) { @@ -565,26 +600,33 @@ template struct int_radix_lut { Torus *get_lut_indexes(size_t ind) { return &lut_indexes[ind]; } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(lut_indexes, stream, gpu_index); - cuda_drop_async(lwe_indexes_in, stream, gpu_index); - cuda_drop_async(lwe_indexes_out, stream, gpu_index); - cuda_drop_async(lwe_trivial_indexes, stream, gpu_index); - cuda_drop_async(lut, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(lut_indexes, streams[0], gpu_indexes[0]); + cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]); + cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]); + cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]); + cuda_drop_async(lut, streams[0], gpu_indexes[0]); if (!mem_reuse) { - switch (params.pbs_type) { - case MULTI_BIT: - 
cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index, - &buffer); - break; - case CLASSICAL: - cleanup_cuda_programmable_bootstrap(stream, gpu_index, &buffer); - break; - default: - PANIC("Cuda error (PBS): unknown PBS type. ") + cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + for (int i = 0; i < buffer.size(); i++) { + switch (params.pbs_type) { + case MULTI_BIT: + cleanup_cuda_multi_bit_programmable_bootstrap( + streams[i], gpu_indexes[i], &buffer[i]); + break; + case CLASSICAL: + cleanup_cuda_programmable_bootstrap(streams[i], gpu_indexes[i], + &buffer[i]); + break; + default: + PANIC("Cuda error (PBS): unknown PBS type. ") + } + cuda_synchronize_stream(streams[i], gpu_indexes[i]); } - cuda_drop_async(tmp_lwe_before_ks, stream, gpu_index); - cuda_drop_async(tmp_lwe_after_ks, stream, gpu_index); + buffer.clear(); } } }; @@ -594,15 +636,16 @@ template struct int_bit_extract_luts_buffer { int_radix_lut *lut; // With offset - int_bit_extract_luts_buffer(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t bits_per_block, - uint32_t final_offset, uint32_t num_radix_blocks, + int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t bits_per_block, uint32_t final_offset, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; - lut = new int_radix_lut(stream, gpu_index, params, bits_per_block, - bits_per_block * num_radix_blocks, - allocate_gpu_memory); + lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, bits_per_block, + bits_per_block * num_radix_blocks, allocate_gpu_memory); if (allocate_gpu_memory) { for (int i = 0; i < bits_per_block; i++) { @@ -613,7 +656,7 @@ template struct int_bit_extract_luts_buffer { }; generate_device_accumulator( - stream, gpu_index, lut->get_lut(i), params.glwe_dimension, + streams[0], gpu_indexes[0], lut->get_lut(i), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, operator_f); } @@ -628,11 +671,12 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < bits_per_block; i++) h_lut_indexes[i + j * bits_per_block] = i; } - cuda_memcpy_async_to_gpu( - lut->lut_indexes, h_lut_indexes, - num_radix_blocks * bits_per_block * sizeof(Torus), stream, gpu_index); - cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback, - h_lut_indexes); + cuda_memcpy_async_to_gpu(lut->lut_indexes, h_lut_indexes, + num_radix_blocks * bits_per_block * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_stream_add_callback(streams[0], gpu_indexes[0], + host_free_on_stream_callback, h_lut_indexes); /** * the input indexes should take the first bits_per_block PBS to target @@ -645,11 +689,12 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < bits_per_block; i++) h_lwe_indexes_in[i + j * bits_per_block] = j; } - cuda_memcpy_async_to_gpu( - lut->lwe_indexes_in, h_lwe_indexes_in, - num_radix_blocks * bits_per_block * sizeof(Torus), stream, gpu_index); - cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback, - h_lwe_indexes_in); + cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in, + num_radix_blocks * bits_per_block * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_stream_add_callback(streams[0], gpu_indexes[0], + host_free_on_stream_callback, h_lwe_indexes_in); /** * the output 
should aim different lwe ciphertexts, so lwe_indexes_out = @@ -661,24 +706,29 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < num_radix_blocks * bits_per_block; i++) h_lwe_indexes_out[i] = i; - cuda_memcpy_async_to_gpu( - lut->lwe_indexes_out, h_lwe_indexes_out, - num_radix_blocks * bits_per_block * sizeof(Torus), stream, gpu_index); - cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback, - h_lwe_indexes_out); + cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out, + num_radix_blocks * bits_per_block * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_stream_add_callback(streams[0], gpu_indexes[0], + host_free_on_stream_callback, h_lwe_indexes_out); } } // Without offset - int_bit_extract_luts_buffer(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t bits_per_block, + int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t bits_per_block, uint32_t num_radix_blocks, bool allocate_gpu_memory) - : int_bit_extract_luts_buffer(stream, gpu_index, params, bits_per_block, - 0, num_radix_blocks, allocate_gpu_memory) {} + : int_bit_extract_luts_buffer(streams, gpu_indexes, gpu_count, params, + bits_per_block, 0, num_radix_blocks, + allocate_gpu_memory) {} - void release(cudaStream_t stream, uint32_t gpu_index) { - lut->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + lut->release(streams, gpu_indexes, gpu_count); + delete (lut); } }; @@ -689,22 +739,20 @@ template struct int_shift_and_rotate_buffer { Torus *tmp_bits; Torus *tmp_shift_bits; - Torus *tmp_rotated; Torus *tmp_input_bits_a; Torus *tmp_input_bits_b; + Torus *tmp_mux_inputs; int_bit_extract_luts_buffer *bit_extract_luts; int_bit_extract_luts_buffer *bit_extract_luts_with_offset_2; - int_radix_lut *mux_lut; - Torus *tmp_mux_inputs; + int_radix_lut *cleaning_lut; Torus offset; - int_radix_lut *cleaning_lut; - - int_shift_and_rotate_buffer(cudaStream_t stream, uint32_t gpu_index, + int_shift_and_rotate_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, int_radix_params params, uint32_t num_radix_blocks, @@ -728,45 +776,46 @@ template struct int_shift_and_rotate_buffer { offset = (shift_type == LEFT_SHIFT ? 
0 : total_nb_bits); bit_extract_luts = new int_bit_extract_luts_buffer( - stream, gpu_index, params, bits_per_block, num_radix_blocks, - allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, bits_per_block, + num_radix_blocks, allocate_gpu_memory); bit_extract_luts_with_offset_2 = new int_bit_extract_luts_buffer( - stream, gpu_index, params, bits_per_block, 2, num_radix_blocks, - allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, bits_per_block, 2, + num_radix_blocks, allocate_gpu_memory); - mux_lut = new int_radix_lut(stream, gpu_index, params, 1, - bits_per_block * num_radix_blocks, + mux_lut = new int_radix_lut(streams, gpu_indexes, gpu_count, params, + 1, bits_per_block * num_radix_blocks, allocate_gpu_memory); - cleaning_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + cleaning_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); if (allocate_gpu_memory) { tmp_bits = (Torus *)cuda_malloc_async(bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); tmp_shift_bits = (Torus *)cuda_malloc_async( max_num_bits_that_tell_shift * num_radix_blocks * (params.big_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); tmp_rotated = (Torus *)cuda_malloc_async( bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); tmp_input_bits_a = (Torus *)cuda_malloc_async( bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); tmp_input_bits_b = (Torus *)cuda_malloc_async( bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); tmp_mux_inputs = (Torus *)cuda_malloc_async( bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); auto mux_lut_f = [](Torus x) -> Torus { // x is expected to be x = 0bcba @@ -787,30 +836,36 @@ template struct int_shift_and_rotate_buffer { }; generate_device_accumulator( - stream, gpu_index, mux_lut->get_lut(0), params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - mux_lut_f); + streams[0], gpu_indexes[0], mux_lut->get_lut(0), + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, mux_lut_f); auto cleaning_lut_f = [](Torus x) -> Torus { return x; }; generate_device_accumulator( - stream, gpu_index, cleaning_lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], cleaning_lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, cleaning_lut_f); } } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(tmp_bits, stream, gpu_index); - cuda_drop_async(tmp_shift_bits, stream, gpu_index); - cuda_drop_async(tmp_rotated, stream, gpu_index); - cuda_drop_async(tmp_input_bits_a, stream, gpu_index); - cuda_drop_async(tmp_input_bits_b, stream, gpu_index); - cuda_drop_async(tmp_mux_inputs, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_bits, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_shift_bits, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_rotated, streams[0], gpu_indexes[0]); + 
cuda_drop_async(tmp_input_bits_a, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_input_bits_b, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_mux_inputs, streams[0], gpu_indexes[0]); - bit_extract_luts->release(stream, gpu_index); - bit_extract_luts_with_offset_2->release(stream, gpu_index); - mux_lut->release(stream, gpu_index); - cleaning_lut->release(stream, gpu_index); + bit_extract_luts->release(streams, gpu_indexes, gpu_count); + bit_extract_luts_with_offset_2->release(streams, gpu_indexes, gpu_count); + mux_lut->release(streams, gpu_indexes, gpu_count); + cleaning_lut->release(streams, gpu_indexes, gpu_count); + + delete (bit_extract_luts); + delete (bit_extract_luts_with_offset_2); + delete (mux_lut); + delete (cleaning_lut); } }; @@ -838,9 +893,9 @@ template struct int_sc_prop_memory { int_radix_params params; - int_sc_prop_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_radix_blocks, - bool allocate_gpu_memory) { + int_sc_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; @@ -851,9 +906,9 @@ template struct int_sc_prop_memory { // allocate memory for intermediate calculations generates_or_propagates = (Torus *)cuda_malloc_async( - num_radix_blocks * big_lwe_size_bytes, stream, gpu_index); + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); step_output = (Torus *)cuda_malloc_async( - num_radix_blocks * big_lwe_size_bytes, stream, gpu_index); + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); // declare functions for lut generation auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus { @@ -882,46 +937,50 @@ template struct int_sc_prop_memory { }; // create lut objects - luts_array = new int_radix_lut( - stream, gpu_index, params, 2, num_radix_blocks, allocate_gpu_memory); - luts_carry_propagation_sum = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, luts_array); - message_acc = new int_radix_lut(stream, gpu_index, params, 1, - num_radix_blocks, luts_array); + luts_array = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 2, + num_radix_blocks, allocate_gpu_memory); + luts_carry_propagation_sum = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, luts_array); + message_acc = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, luts_array); auto lut_does_block_generate_carry = luts_array->get_lut(0); auto lut_does_block_generate_or_propagate = luts_array->get_lut(1); // generate luts (aka accumulators) generate_device_accumulator( - stream, gpu_index, lut_does_block_generate_carry, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, + streams[0], gpu_indexes[0], lut_does_block_generate_carry, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_carry); generate_device_accumulator( - stream, gpu_index, lut_does_block_generate_or_propagate, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, + streams[0], gpu_indexes[0], lut_does_block_generate_or_propagate, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate); - cuda_set_value_async(stream, gpu_index, + cuda_set_value_async(streams[0], gpu_indexes[0], luts_array->get_lut_indexes(1), 1, 
num_radix_blocks - 1); generate_device_accumulator_bivariate( - stream, gpu_index, luts_carry_propagation_sum->lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, + streams[0], gpu_indexes[0], luts_carry_propagation_sum->lut, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_luts_carry_propagation_sum); generate_device_accumulator( - stream, gpu_index, message_acc->lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_message_acc); + streams[0], gpu_indexes[0], message_acc->lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_acc); } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(generates_or_propagates, stream, gpu_index); - cuda_drop_async(step_output, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); + cuda_drop_async(step_output, streams[0], gpu_indexes[0]); - luts_array->release(stream, gpu_index); - luts_carry_propagation_sum->release(stream, gpu_index); - message_acc->release(stream, gpu_index); + luts_array->release(streams, gpu_indexes, gpu_count); + luts_carry_propagation_sum->release(streams, gpu_indexes, gpu_count); + message_acc->release(streams, gpu_indexes, gpu_count); delete luts_array; delete luts_carry_propagation_sum; @@ -941,8 +1000,8 @@ template struct int_single_borrow_prop_memory { int_radix_params params; - int_single_borrow_prop_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, + int_single_borrow_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -955,9 +1014,9 @@ template struct int_single_borrow_prop_memory { // allocate memory for intermediate calculations generates_or_propagates = (Torus *)cuda_malloc_async( - num_radix_blocks * big_lwe_size_bytes, stream, gpu_index); + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); step_output = (Torus *)cuda_malloc_async( - num_radix_blocks * big_lwe_size_bytes, stream, gpu_index); + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); // declare functions for lut generation auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus { @@ -986,46 +1045,50 @@ template struct int_single_borrow_prop_memory { }; // create lut objects - luts_array = new int_radix_lut( - stream, gpu_index, params, 2, num_radix_blocks, allocate_gpu_memory); - luts_borrow_propagation_sum = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, luts_array); - message_acc = new int_radix_lut(stream, gpu_index, params, 1, - num_radix_blocks, luts_array); + luts_array = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 2, + num_radix_blocks, allocate_gpu_memory); + luts_borrow_propagation_sum = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, luts_array); + message_acc = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, luts_array); auto lut_does_block_generate_carry = luts_array->get_lut(0); auto lut_does_block_generate_or_propagate = luts_array->get_lut(1); // generate luts (aka accumulators) generate_device_accumulator( - stream, gpu_index, lut_does_block_generate_carry, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, + streams[0], gpu_indexes[0], lut_does_block_generate_carry, + 
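In int_sc_prop_memory, luts_array packs two accumulators into one int_radix_lut. Every radix block initially points at LUT 0, so the cuda_set_value_async call above rewrites the indexes of blocks 1..num_radix_blocks-1 to select LUT 1. A sketch of that selection step, assuming get_lut_indexes(k) returns the device index array offset by k blocks, as it is used above:

// Block 0 keeps LUT 0 ("does this block generate a carry?"); every other
// block is switched to LUT 1 ("does this block generate or propagate?").
// With num_radix_blocks = 4 the index array ends up as { 0, 1, 1, 1 }.
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0],
                            luts_array->get_lut_indexes(1),
                            /*value*/ 1, /*count*/ num_radix_blocks - 1);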
glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_carry); generate_device_accumulator( - stream, gpu_index, lut_does_block_generate_or_propagate, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, + streams[0], gpu_indexes[0], lut_does_block_generate_or_propagate, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate); - cuda_set_value_async(stream, gpu_index, + cuda_set_value_async(streams[0], gpu_indexes[0], luts_array->get_lut_indexes(1), 1, num_radix_blocks - 1); generate_device_accumulator_bivariate( - stream, gpu_index, luts_borrow_propagation_sum->lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, + streams[0], gpu_indexes[0], luts_borrow_propagation_sum->lut, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_luts_borrow_propagation_sum); generate_device_accumulator( - stream, gpu_index, message_acc->lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_message_acc); + streams[0], gpu_indexes[0], message_acc->lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_acc); } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(generates_or_propagates, stream, gpu_index); - cuda_drop_async(step_output, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); + cuda_drop_async(step_output, streams[0], gpu_indexes[0]); - luts_array->release(stream, gpu_index); - luts_borrow_propagation_sum->release(stream, gpu_index); - message_acc->release(stream, gpu_index); + luts_array->release(streams, gpu_indexes, gpu_count); + luts_borrow_propagation_sum->release(streams, gpu_indexes, gpu_count); + message_acc->release(streams, gpu_indexes, gpu_count); delete luts_array; delete luts_borrow_propagation_sum; @@ -1045,37 +1108,38 @@ template struct int_sum_ciphertexts_vec_memory { bool mem_reuse = false; - int_sum_ciphertexts_vec_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, + int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, bool allocate_gpu_memory) { this->params = params; // create single carry propagation memory object - scp_mem = new int_sc_prop_memory( - stream, gpu_index, params, num_blocks_in_radix, allocate_gpu_memory); + scp_mem = + new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, + num_blocks_in_radix, allocate_gpu_memory); int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; // allocate gpu memory for intermediate buffers new_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), stream, - gpu_index); + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); old_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), stream, - gpu_index); + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); small_lwe_vector = (Torus *)cuda_malloc_async( max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); d_smart_copy_in = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), stream, gpu_index); + max_pbs_count * sizeof(int32_t), 
streams[0], gpu_indexes[0]); d_smart_copy_out = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), stream, gpu_index); + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); } - int_sum_ciphertexts_vec_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, + int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, Torus *new_blocks, Torus *old_blocks, @@ -1084,8 +1148,8 @@ template struct int_sum_ciphertexts_vec_memory { this->params = params; // create single carry propagation memory object - scp_mem = new int_sc_prop_memory(stream, gpu_index, params, - num_blocks_in_radix, true); + scp_mem = new int_sc_prop_memory(streams, gpu_indexes, gpu_count, + params, num_blocks_in_radix, true); int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; // assign gpu memory for intermediate buffers @@ -1094,22 +1158,23 @@ template struct int_sum_ciphertexts_vec_memory { this->small_lwe_vector = small_lwe_vector; d_smart_copy_in = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), stream, gpu_index); + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); d_smart_copy_out = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), stream, gpu_index); + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(d_smart_copy_in, stream, gpu_index); - cuda_drop_async(d_smart_copy_out, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]); + cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); if (!mem_reuse) { - cuda_drop_async(new_blocks, stream, gpu_index); - cuda_drop_async(old_blocks, stream, gpu_index); - cuda_drop_async(small_lwe_vector, stream, gpu_index); + cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); } - scp_mem->release(stream, gpu_index); + scp_mem->release(streams, gpu_indexes, gpu_count); delete scp_mem; } @@ -1117,24 +1182,25 @@ template struct int_sum_ciphertexts_vec_memory { template struct int_overflowing_sub_memory { int_radix_params params; - bool mem_reuse = false; int_radix_lut *luts_message_carry; int_single_borrow_prop_memory *borrow_prop_mem; - int_overflowing_sub_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_blocks, - bool allocate_gpu_memory) { + int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_blocks, bool allocate_gpu_memory) { this->params = params; auto message_modulus = params.message_modulus; auto carry_modulus = params.carry_modulus; borrow_prop_mem = new int_single_borrow_prop_memory( - stream, gpu_index, params, num_blocks, allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, num_blocks, + allocate_gpu_memory); int max_pbs_count = num_blocks * 2; // create lut object for message and carry - luts_message_carry = new int_radix_lut( - stream, gpu_index, params, 2, max_pbs_count, allocate_gpu_memory); + luts_message_carry = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 2, + max_pbs_count, allocate_gpu_memory); auto message_acc = luts_message_carry->get_lut(0); auto carry_acc = 
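int_sum_ciphertexts_vec_memory above has two constructors: the first allocates new_blocks / old_blocks / small_lwe_vector itself, while the second borrows them from the caller (the mem_reuse path used by int_mul_memory) and therefore skips freeing them in release. A hypothetical caller-side sketch of the borrowing variant, assuming Torus = uint64_t and the block sizes used above:

// Hypothetical: allocate the shared scratch once on the first GPU, then hand
// it to the borrowing constructor so several memory objects can reuse it.
uint32_t max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
size_t big_lwe_size = params.big_lwe_dimension + 1;
size_t small_lwe_size = params.small_lwe_dimension + 1;

auto *new_blocks = (uint64_t *)cuda_malloc_async(
    max_pbs_count * big_lwe_size * sizeof(uint64_t), streams[0],
    gpu_indexes[0]);
auto *old_blocks = (uint64_t *)cuda_malloc_async(
    max_pbs_count * big_lwe_size * sizeof(uint64_t), streams[0],
    gpu_indexes[0]);
auto *small_lwe_vector = (uint64_t *)cuda_malloc_async(
    max_pbs_count * small_lwe_size * sizeof(uint64_t), streams[0],
    gpu_indexes[0]);

auto *sum_mem = new int_sum_ciphertexts_vec_memory<uint64_t>(
    streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
    max_num_radix_in_vec, new_blocks, old_blocks, small_lwe_vector);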
luts_message_carry->get_lut(1); @@ -1149,15 +1215,16 @@ template struct int_overflowing_sub_memory { // generate accumulators generate_device_accumulator( - stream, gpu_index, message_acc, params.glwe_dimension, + streams[0], gpu_indexes[0], message_acc, params.glwe_dimension, params.polynomial_size, message_modulus, carry_modulus, lut_f_message); generate_device_accumulator( - stream, gpu_index, carry_acc, params.glwe_dimension, + streams[0], gpu_indexes[0], carry_acc, params.glwe_dimension, params.polynomial_size, message_modulus, carry_modulus, lut_f_carry); } - void release(cudaStream_t stream, uint32_t gpu_index) { - luts_message_carry->release(stream, gpu_index); - borrow_prop_mem->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + luts_message_carry->release(streams, gpu_indexes, gpu_count); + borrow_prop_mem->release(streams, gpu_indexes, gpu_count); delete luts_message_carry; delete borrow_prop_mem; @@ -1174,9 +1241,9 @@ template struct int_mul_memory { int_radix_params params; - int_mul_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_radix_blocks, - bool allocate_gpu_memory) { + int_mul_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; @@ -1198,19 +1265,20 @@ template struct int_mul_memory { vector_result_sb = (Torus *)cuda_malloc_async( 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); block_mul_res = (Torus *)cuda_malloc_async( 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); small_lwe_vector = (Torus *)cuda_malloc_async( - total_block_count * (lwe_dimension + 1) * sizeof(Torus), stream, - gpu_index); + total_block_count * (lwe_dimension + 1) * sizeof(Torus), streams[0], + gpu_indexes[0]); // create int_radix_lut objects for lsb, msb, message, carry // luts_array -> lut = {lsb_acc, msb_acc} - luts_array = new int_radix_lut( - stream, gpu_index, params, 2, total_block_count, allocate_gpu_memory); + luts_array = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 2, + total_block_count, allocate_gpu_memory); auto lsb_acc = luts_array->get_lut(0); auto msb_acc = luts_array->get_lut(1); @@ -1224,10 +1292,10 @@ template struct int_mul_memory { // generate accumulators generate_device_accumulator_bivariate( - stream, gpu_index, lsb_acc, glwe_dimension, polynomial_size, + streams[0], gpu_indexes[0], lsb_acc, glwe_dimension, polynomial_size, message_modulus, carry_modulus, lut_f_lsb); generate_device_accumulator_bivariate( - stream, gpu_index, msb_acc, glwe_dimension, polynomial_size, + streams[0], gpu_indexes[0], msb_acc, glwe_dimension, polynomial_size, message_modulus, carry_modulus, lut_f_msb); // lut_indexes for luts_array should be reinitialized @@ -1235,23 +1303,25 @@ template struct int_mul_memory { // last msb_vector_block_count values should reference to msb_acc // for message and carry default lut_indexes is fine cuda_set_value_async( - stream, gpu_index, luts_array->get_lut_indexes(lsb_vector_block_count), - 1, msb_vector_block_count); + streams[0], gpu_indexes[0], + luts_array->get_lut_indexes(lsb_vector_block_count), 1, + msb_vector_block_count); // create memory object for sum 
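int_mul_memory packs the LSB and MSB block-product accumulators into a single LUT object, then rewrites the per-block LUT indexes so the first lsb_vector_block_count blocks use lsb_acc and the remaining ones use msb_acc. A sketch of generating one such bivariate accumulator on the first GPU; the LSB function body is a plausible illustration, not taken from this diff:

// The lambda defines the plaintext function the LUT encodes; the PBS later
// applies it homomorphically. Generation runs on streams[0] / gpu_indexes[0].
auto message_modulus = params.message_modulus;
auto lut_f_lsb = [message_modulus](Torus lhs, Torus rhs) -> Torus {
  return (lhs * rhs) % message_modulus; // low part of a block product
};
generate_device_accumulator_bivariate<Torus>(
    streams[0], gpu_indexes[0], luts_array->get_lut(0), params.glwe_dimension,
    params.polynomial_size, params.message_modulus, params.carry_modulus,
    lut_f_lsb);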
ciphertexts sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory( - stream, gpu_index, params, num_radix_blocks, 2 * num_radix_blocks, - block_mul_res, vector_result_sb, small_lwe_vector); + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + 2 * num_radix_blocks, block_mul_res, vector_result_sb, + small_lwe_vector); } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(vector_result_sb, stream, gpu_index); - cuda_drop_async(block_mul_res, stream, gpu_index); - cuda_drop_async(small_lwe_vector, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]); + cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]); + cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); - luts_array->release(stream, gpu_index); - - sum_ciphertexts_mem->release(stream, gpu_index); + luts_array->release(streams, gpu_indexes, gpu_count); + sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); delete luts_array; delete sum_ciphertexts_mem; @@ -1268,7 +1338,8 @@ template struct int_logical_scalar_shift_buffer { bool reuse_memory = false; - int_logical_scalar_shift_buffer(cudaStream_t stream, uint32_t gpu_index, + int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, int_radix_params params, uint32_t num_radix_blocks, @@ -1281,12 +1352,13 @@ template struct int_logical_scalar_shift_buffer { uint32_t big_lwe_size = params.big_lwe_dimension + 1; uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - tmp_rotated = (Torus *)cuda_malloc_async( - (max_amount_of_pbs + 2) * big_lwe_size_bytes, stream, gpu_index); + tmp_rotated = (Torus *)cuda_malloc_async((max_amount_of_pbs + 2) * + big_lwe_size_bytes, + streams[0], gpu_indexes[0]); cuda_memset_async(tmp_rotated, 0, - (max_amount_of_pbs + 2) * big_lwe_size_bytes, stream, - gpu_index); + (max_amount_of_pbs + 2) * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus); @@ -1305,7 +1377,7 @@ template struct int_logical_scalar_shift_buffer { // circuit it can reuse memory for different shift values for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) { auto cur_lut_bivariate = - new int_radix_lut(stream, gpu_index, params, 1, + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); uint32_t shift_within_block = s_w_b; @@ -1346,16 +1418,17 @@ template struct int_logical_scalar_shift_buffer { // right shift generate_device_accumulator_bivariate( - stream, gpu_index, cur_lut_bivariate->lut, params.glwe_dimension, - params.polynomial_size, params.message_modulus, - params.carry_modulus, shift_lut_f); + streams[0], gpu_indexes[0], cur_lut_bivariate->lut, + params.glwe_dimension, params.polynomial_size, + params.message_modulus, params.carry_modulus, shift_lut_f); lut_buffers_bivariate.push_back(cur_lut_bivariate); } } } - int_logical_scalar_shift_buffer(cudaStream_t stream, uint32_t gpu_index, + int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, int_radix_params params, uint32_t num_radix_blocks, @@ -1370,8 +1443,8 @@ template struct int_logical_scalar_shift_buffer { uint32_t big_lwe_size = params.big_lwe_dimension + 1; uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); cuda_memset_async(tmp_rotated, 0, - (max_amount_of_pbs + 
2) * big_lwe_size_bytes, stream, - gpu_index); + (max_amount_of_pbs + 2) * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); if (allocate_gpu_memory) { uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus); @@ -1391,7 +1464,7 @@ template struct int_logical_scalar_shift_buffer { // circuit it can reuse memory for different shift values for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) { auto cur_lut_bivariate = - new int_radix_lut(stream, gpu_index, params, 1, + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); uint32_t shift_within_block = s_w_b; @@ -1432,23 +1505,24 @@ template struct int_logical_scalar_shift_buffer { // right shift generate_device_accumulator_bivariate( - stream, gpu_index, cur_lut_bivariate->lut, params.glwe_dimension, - params.polynomial_size, params.message_modulus, - params.carry_modulus, shift_lut_f); + streams[0], gpu_indexes[0], cur_lut_bivariate->lut, + params.glwe_dimension, params.polynomial_size, + params.message_modulus, params.carry_modulus, shift_lut_f); lut_buffers_bivariate.push_back(cur_lut_bivariate); } } } - void release(cudaStream_t stream, uint32_t gpu_index) { + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { for (auto &buffer : lut_buffers_bivariate) { - buffer->release(stream, gpu_index); + buffer->release(streams, gpu_indexes, gpu_count); delete buffer; } lut_buffers_bivariate.clear(); if (!reuse_memory) - cuda_drop_async(tmp_rotated, stream, gpu_index); + cuda_drop_async(tmp_rotated, streams[0], gpu_indexes[0]); } }; @@ -1464,7 +1538,8 @@ template struct int_arithmetic_scalar_shift_buffer { cudaStream_t local_stream_1; cudaStream_t local_stream_2; - int_arithmetic_scalar_shift_buffer(cudaStream_t stream, uint32_t gpu_index, + int_arithmetic_scalar_shift_buffer(cudaStream_t *streams, + uint32_t *gpu_indexes, uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, int_radix_params params, uint32_t num_radix_blocks, @@ -1472,8 +1547,8 @@ template struct int_arithmetic_scalar_shift_buffer { // In the arithmetic shift, a PBS has to be applied to the last rotated // block twice: once to shift it, once to compute the padding block to be // copied onto all blocks to the left of the last rotated block - local_stream_1 = cuda_create_stream(gpu_index); - local_stream_2 = cuda_create_stream(gpu_index); + local_stream_1 = cuda_create_stream(gpu_indexes[0]); + local_stream_2 = cuda_create_stream(gpu_indexes[0]); this->shift_type = shift_type; this->params = params; @@ -1481,12 +1556,13 @@ template struct int_arithmetic_scalar_shift_buffer { uint32_t big_lwe_size = params.big_lwe_dimension + 1; uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - tmp_rotated = (Torus *)cuda_malloc_async( - (num_radix_blocks + 2) * big_lwe_size_bytes, stream, gpu_index); + tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 2) * + big_lwe_size_bytes, + streams[0], gpu_indexes[0]); cuda_memset_async(tmp_rotated, 0, - (num_radix_blocks + 2) * big_lwe_size_bytes, stream, - gpu_index); + (num_radix_blocks + 2) * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus); @@ -1500,7 +1576,7 @@ template struct int_arithmetic_scalar_shift_buffer { // With two bits of message this is actually only one LUT. 
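Both int_logical_scalar_shift_buffer constructors above build one bivariate LUT per possible shift amount inside a block and keep them in lut_buffers_bivariate. A condensed sketch of that loop, reusing the shift_lut_f lambda that the hunks above define per iteration:

// One LUT per shift-within-block value (num_bits_in_block - 1 of them); the
// LUT wrapper receives the full stream/GPU arrays, but accumulator generation
// itself runs on the first stream / first GPU.
for (uint32_t s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
  auto *cur_lut_bivariate =
      new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
                               num_radix_blocks, allocate_gpu_memory);
  generate_device_accumulator_bivariate<Torus>(
      streams[0], gpu_indexes[0], cur_lut_bivariate->lut,
      params.glwe_dimension, params.polynomial_size, params.message_modulus,
      params.carry_modulus, shift_lut_f); // shift_lut_f captures s_w_b
  lut_buffers_bivariate.push_back(cur_lut_bivariate);
}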
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) { auto shift_last_block_lut_univariate = new int_radix_lut( - stream, gpu_index, params, 1, 1, allocate_gpu_memory); + streams, gpu_indexes, 1, params, 1, 1, allocate_gpu_memory); uint32_t shift_within_block = s_w_b; @@ -1523,7 +1599,7 @@ template struct int_arithmetic_scalar_shift_buffer { }; generate_device_accumulator( - stream, gpu_index, shift_last_block_lut_univariate->lut, + streams[0], gpu_indexes[0], shift_last_block_lut_univariate->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, last_block_lut_f); @@ -1531,7 +1607,7 @@ template struct int_arithmetic_scalar_shift_buffer { } auto padding_block_lut_univariate = new int_radix_lut( - stream, gpu_index, params, 1, 1, allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); // lut to compute the padding block std::function padding_block_lut_f; @@ -1544,7 +1620,7 @@ template struct int_arithmetic_scalar_shift_buffer { }; generate_device_accumulator( - stream, gpu_index, padding_block_lut_univariate->lut, + streams[0], gpu_indexes[0], padding_block_lut_univariate->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, padding_block_lut_f); @@ -1557,7 +1633,7 @@ template struct int_arithmetic_scalar_shift_buffer { // NB: with two bits of message, this is actually only one LUT. for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) { auto shift_blocks_lut_bivariate = - new int_radix_lut(stream, gpu_index, params, 1, + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); uint32_t shift_within_block = s_w_b; @@ -1582,7 +1658,7 @@ template struct int_arithmetic_scalar_shift_buffer { }; generate_device_accumulator_bivariate( - stream, gpu_index, shift_blocks_lut_bivariate->lut, + streams[0], gpu_indexes[0], shift_blocks_lut_bivariate->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, blocks_lut_f); @@ -1591,21 +1667,22 @@ template struct int_arithmetic_scalar_shift_buffer { } } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_destroy_stream(local_stream_1, gpu_index); - cuda_destroy_stream(local_stream_2, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_destroy_stream(local_stream_1, gpu_indexes[0]); + cuda_destroy_stream(local_stream_2, gpu_indexes[0]); for (auto &buffer : lut_buffers_bivariate) { - buffer->release(stream, gpu_index); + buffer->release(streams, gpu_indexes, gpu_count); delete buffer; } for (auto &buffer : lut_buffers_univariate) { - buffer->release(stream, gpu_index); + buffer->release(streams, gpu_indexes, 1); delete buffer; } lut_buffers_bivariate.clear(); lut_buffers_univariate.clear(); - cuda_drop_async(tmp_rotated, stream, gpu_index); + cuda_drop_async(tmp_rotated, streams[0], gpu_indexes[0]); } }; @@ -1617,23 +1694,23 @@ template struct int_zero_out_if_buffer { cudaStream_t local_stream; - int_zero_out_if_buffer(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_radix_blocks, - bool allocate_gpu_memory) { + int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; Torus big_size = (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); if (allocate_gpu_memory) { - - tmp = (Torus 
*)cuda_malloc_async(big_size, stream, gpu_index); + tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); // We may use a different stream to allow concurrent operation - local_stream = cuda_create_stream(gpu_index); + local_stream = cuda_create_stream(gpu_indexes[0]); } } - void release(cudaStream_t stream, uint32_t gpu_index) { - cuda_drop_async(tmp, stream, gpu_index); - cuda_destroy_stream(local_stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp, streams[0], gpu_indexes[0]); + cuda_destroy_stream(local_stream, gpu_indexes[0]); } }; @@ -1650,7 +1727,8 @@ template struct int_cmux_buffer { int_radix_params params; - int_cmux_buffer(cudaStream_t stream, uint32_t gpu_index, + int_cmux_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::function predicate_lut_f, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -1661,13 +1739,17 @@ template struct int_cmux_buffer { Torus big_size = (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - tmp_true_ct = (Torus *)cuda_malloc_async(big_size, stream, gpu_index); - tmp_false_ct = (Torus *)cuda_malloc_async(big_size, stream, gpu_index); + tmp_true_ct = + (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); + tmp_false_ct = + (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); zero_if_true_buffer = new int_zero_out_if_buffer( - stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory); + streams, gpu_indexes, 1, params, num_radix_blocks, + allocate_gpu_memory); zero_if_false_buffer = new int_zero_out_if_buffer( - stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory); + streams, gpu_indexes, 1, params, num_radix_blocks, + allocate_gpu_memory); auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus { return predicate_lut_f(condition) ? 
0 : block; @@ -1680,47 +1762,51 @@ template struct int_cmux_buffer { return x % params.message_modulus; }; - predicate_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + predicate_lut = + new int_radix_lut(streams, gpu_indexes, 1, params, 1, + num_radix_blocks, allocate_gpu_memory); - inverted_predicate_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + inverted_predicate_lut = + new int_radix_lut(streams, gpu_indexes, 1, params, 1, + num_radix_blocks, allocate_gpu_memory); - message_extract_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + message_extract_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); generate_device_accumulator_bivariate( - stream, gpu_index, predicate_lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], predicate_lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f); generate_device_accumulator_bivariate( - stream, gpu_index, inverted_predicate_lut->lut, params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - inverted_lut_f); + streams[0], gpu_indexes[0], inverted_predicate_lut->lut, + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, inverted_lut_f); generate_device_accumulator( - stream, gpu_index, message_extract_lut->lut, params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - message_extract_lut_f); + streams[0], gpu_indexes[0], message_extract_lut->lut, + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, message_extract_lut_f); } } - void release(cudaStream_t stream, uint32_t gpu_index) { - predicate_lut->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + predicate_lut->release(streams, gpu_indexes, 1); delete predicate_lut; - inverted_predicate_lut->release(stream, gpu_index); + inverted_predicate_lut->release(streams, gpu_indexes, 1); delete inverted_predicate_lut; - message_extract_lut->release(stream, gpu_index); + message_extract_lut->release(streams, gpu_indexes, gpu_count); delete message_extract_lut; - zero_if_true_buffer->release(stream, gpu_index); + zero_if_true_buffer->release(streams, gpu_indexes, 1); delete zero_if_true_buffer; - zero_if_false_buffer->release(stream, gpu_index); + zero_if_false_buffer->release(streams, gpu_indexes, 1); delete zero_if_false_buffer; - cuda_drop_async(tmp_true_ct, stream, gpu_index); - cuda_drop_async(tmp_false_ct, stream, gpu_index); + cuda_drop_async(tmp_true_ct, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_false_ct, streams[0], gpu_indexes[0]); } }; @@ -1729,16 +1815,16 @@ template struct int_are_all_block_true_buffer { int_radix_params params; Torus *tmp_out; + Torus *tmp_block_accumulated; // This map store LUTs that checks the equality between some input and values // of interest in are_all_block_true(), as with max_value (the maximum message // value). 
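Several helpers above are deliberately kept on a single GPU: the cmux predicate LUTs and the zero_out_if buffers pass a literal 1 where gpu_count is expected, and their release calls pass the same 1. A sketch of that pattern, assuming int_radix_lut accepts any count no larger than the caller's real gpu_count:

// A LUT pinned to the first GPU: whatever gpu_count the caller holds,
// construction and release must agree on the count actually used (here 1),
// exactly as the cmux and zero_out_if buffers above do.
auto *single_gpu_lut = new int_radix_lut<Torus>(
    streams, gpu_indexes, /*gpu_count*/ 1, params, /*num_luts*/ 1,
    num_radix_blocks, allocate_gpu_memory);
// ... apply the LUT ...
single_gpu_lut->release(streams, gpu_indexes, /*gpu_count*/ 1);
delete single_gpu_lut;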
std::unordered_map *> is_equal_to_lut_map; - Torus *tmp_block_accumulated; - - int_are_all_block_true_buffer(cudaStream_t stream, uint32_t gpu_index, - COMPARISON_TYPE op, int_radix_params params, + int_are_all_block_true_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, COMPARISON_TYPE op, + int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -1750,22 +1836,23 @@ template struct int_are_all_block_true_buffer { int max_chunks = (num_radix_blocks + max_value - 1) / max_value; tmp_block_accumulated = (Torus *)cuda_malloc_async( - (params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), stream, - gpu_index); + (params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), + streams[0], gpu_indexes[0]); tmp_out = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); } } - void release(cudaStream_t stream, uint32_t gpu_index) { + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { for (auto &lut : is_equal_to_lut_map) { - lut.second->release(stream, gpu_index); + lut.second->release(streams, gpu_indexes, 1); } is_equal_to_lut_map.clear(); - cuda_drop_async(tmp_block_accumulated, stream, gpu_index); - cuda_drop_async(tmp_out, stream, gpu_index); + cuda_drop_async(tmp_block_accumulated, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_out, streams[0], gpu_indexes[0]); } }; @@ -1775,14 +1862,13 @@ template struct int_comparison_eq_buffer { int_radix_lut *operator_lut; int_radix_lut *is_non_zero_lut; + int_radix_lut *scalar_comparison_luts; int_are_all_block_true_buffer *are_all_block_true_buffer; - int_radix_lut *scalar_comparison_luts; - - int_comparison_eq_buffer(cudaStream_t stream, uint32_t gpu_index, - COMPARISON_TYPE op, int_radix_params params, - uint32_t num_radix_blocks, + int_comparison_eq_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, COMPARISON_TYPE op, + int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; this->op = op; @@ -1790,7 +1876,8 @@ template struct int_comparison_eq_buffer { if (allocate_gpu_memory) { are_all_block_true_buffer = new int_are_all_block_true_buffer( - stream, gpu_index, op, params, num_radix_blocks, allocate_gpu_memory); + streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, + allocate_gpu_memory); // Operator LUT auto operator_f = [op](Torus lhs, Torus rhs) -> Torus { @@ -1802,11 +1889,12 @@ template struct int_comparison_eq_buffer { return (lhs != rhs); } }; - operator_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + operator_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); generate_device_accumulator_bivariate( - stream, gpu_index, operator_lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], operator_lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, operator_f); @@ -1816,18 +1904,19 @@ template struct int_comparison_eq_buffer { return (x % total_modulus) != 0; }; - is_non_zero_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + is_non_zero_lut = + new int_radix_lut(streams, gpu_indexes, 1, params, 1, + num_radix_blocks, allocate_gpu_memory); generate_device_accumulator( - stream, gpu_index, is_non_zero_lut->lut, params.glwe_dimension, - 
params.polynomial_size, params.message_modulus, params.carry_modulus, - is_non_zero_lut_f); + streams[0], gpu_indexes[0], is_non_zero_lut->lut, + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, is_non_zero_lut_f); // Scalar may have up to num_radix_blocks blocks - scalar_comparison_luts = - new int_radix_lut(stream, gpu_index, params, total_modulus, - num_radix_blocks, allocate_gpu_memory); + scalar_comparison_luts = new int_radix_lut( + streams, gpu_indexes, 1, params, total_modulus, num_radix_blocks, + allocate_gpu_memory); for (int i = 0; i < total_modulus; i++) { auto lut_f = [i, operator_f](Torus x) -> Torus { @@ -1837,24 +1926,23 @@ template struct int_comparison_eq_buffer { Torus *lut = scalar_comparison_luts->lut + i * (params.glwe_dimension + 1) * params.polynomial_size; generate_device_accumulator( - stream, gpu_index, lut, params.glwe_dimension, + streams[0], gpu_indexes[0], lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f); } } } - void release(cudaStream_t stream, uint32_t gpu_index) { - operator_lut->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + operator_lut->release(streams, gpu_indexes, gpu_count); delete operator_lut; - is_non_zero_lut->release(stream, gpu_index); + is_non_zero_lut->release(streams, gpu_indexes, 1); delete is_non_zero_lut; - - are_all_block_true_buffer->release(stream, gpu_index); - delete are_all_block_true_buffer; - - scalar_comparison_luts->release(stream, gpu_index); + scalar_comparison_luts->release(streams, gpu_indexes, gpu_count); delete scalar_comparison_luts; + are_all_block_true_buffer->release(streams, gpu_indexes, gpu_count); + delete are_all_block_true_buffer; } }; @@ -1871,7 +1959,8 @@ template struct int_tree_sign_reduction_buffer { Torus *tmp_x; Torus *tmp_y; - int_tree_sign_reduction_buffer(cudaStream_t stream, uint32_t gpu_index, + int_tree_sign_reduction_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::function operator_f, int_radix_params params, uint32_t num_radix_blocks, @@ -1888,36 +1977,40 @@ template struct int_tree_sign_reduction_buffer { }; if (allocate_gpu_memory) { - tmp_x = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream, - gpu_index); - tmp_y = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream, - gpu_index); + tmp_x = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, + streams[0], gpu_indexes[0]); + tmp_y = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, + streams[0], gpu_indexes[0]); // LUTs - tree_inner_leaf_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + tree_inner_leaf_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); - tree_last_leaf_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + tree_last_leaf_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); - tree_last_leaf_scalar_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + tree_last_leaf_scalar_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); generate_device_accumulator_bivariate( - stream, gpu_index, tree_inner_leaf_lut->lut, params.glwe_dimension, - params.polynomial_size, params.message_modulus, 
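scalar_comparison_luts above stores one accumulator per possible scalar block value (total_modulus of them) inside a single int_radix_lut, each written at its offset in the contiguous lut buffer. A sketch of that fill loop, assuming the same pointer layout; the lambda body is a plausible reconstruction of the comparison against the scalar value i:

// Each accumulator occupies (glwe_dimension + 1) * polynomial_size elements;
// accumulator i encodes "compare a block against the scalar value i".
size_t acc_size = (params.glwe_dimension + 1) * params.polynomial_size;
for (uint32_t i = 0; i < total_modulus; i++) {
  auto lut_f = [i, operator_f](Torus x) -> Torus { return operator_f(i, x); };
  Torus *lut_i = scalar_comparison_luts->lut + i * acc_size;
  generate_device_accumulator<Torus>(
      streams[0], gpu_indexes[0], lut_i, params.glwe_dimension,
      params.polynomial_size, params.message_modulus, params.carry_modulus,
      lut_f);
}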
params.carry_modulus, - block_selector_f); + streams[0], gpu_indexes[0], tree_inner_leaf_lut->lut, + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, block_selector_f); } } - void release(cudaStream_t stream, uint32_t gpu_index) { - tree_inner_leaf_lut->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + tree_inner_leaf_lut->release(streams, gpu_indexes, gpu_count); delete tree_inner_leaf_lut; - tree_last_leaf_lut->release(stream, gpu_index); + tree_last_leaf_lut->release(streams, gpu_indexes, gpu_count); delete tree_last_leaf_lut; - tree_last_leaf_scalar_lut->release(stream, gpu_index); + tree_last_leaf_scalar_lut->release(streams, gpu_indexes, gpu_count); delete tree_last_leaf_scalar_lut; - cuda_drop_async(tmp_x, stream, gpu_index); - cuda_drop_async(tmp_y, stream, gpu_index); + cuda_drop_async(tmp_x, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_y, streams[0], gpu_indexes[0]); } }; @@ -1936,9 +2029,9 @@ template struct int_comparison_diff_buffer { Torus *tmp_signs_b; int_radix_lut *reduce_signs_lut; - int_comparison_diff_buffer(cudaStream_t stream, uint32_t gpu_index, - COMPARISON_TYPE op, int_radix_params params, - uint32_t num_radix_blocks, + int_comparison_diff_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, COMPARISON_TYPE op, + int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; this->op = op; @@ -1963,34 +2056,36 @@ template struct int_comparison_diff_buffer { Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus); tmp_packed_left = (Torus *)cuda_malloc_async( - big_size * (num_radix_blocks / 2), stream, gpu_index); + big_size * (num_radix_blocks / 2), streams[0], gpu_indexes[0]); tmp_packed_right = (Torus *)cuda_malloc_async( - big_size * (num_radix_blocks / 2), stream, gpu_index); + big_size * (num_radix_blocks / 2), streams[0], gpu_indexes[0]); tree_buffer = new int_tree_sign_reduction_buffer( - stream, gpu_index, operator_f, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, operator_f, params, num_radix_blocks, allocate_gpu_memory); tmp_signs_a = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, - stream, gpu_index); + streams[0], gpu_indexes[0]); tmp_signs_b = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, - stream, gpu_index); + streams[0], gpu_indexes[0]); // LUTs - reduce_signs_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + reduce_signs_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); } } - void release(cudaStream_t stream, uint32_t gpu_index) { - tree_buffer->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + tree_buffer->release(streams, gpu_indexes, gpu_count); delete tree_buffer; - reduce_signs_lut->release(stream, gpu_index); + reduce_signs_lut->release(streams, gpu_indexes, gpu_count); delete reduce_signs_lut; - cuda_drop_async(tmp_packed_left, stream, gpu_index); - cuda_drop_async(tmp_packed_right, stream, gpu_index); - cuda_drop_async(tmp_signs_a, stream, gpu_index); - cuda_drop_async(tmp_signs_b, stream, gpu_index); + cuda_drop_async(tmp_packed_left, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_packed_right, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_signs_a, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_signs_b, streams[0], gpu_indexes[0]); } }; 
@@ -2027,10 +2122,10 @@ template struct int_comparison_buffer { cudaStream_t lsb_stream; cudaStream_t msb_stream; - int_comparison_buffer(cudaStream_t stream, uint32_t gpu_index, - COMPARISON_TYPE op, int_radix_params params, - uint32_t num_radix_blocks, bool is_signed, - bool allocate_gpu_memory) { + int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, COMPARISON_TYPE op, + int_radix_params params, uint32_t num_radix_blocks, + bool is_signed, bool allocate_gpu_memory) { this->params = params; this->op = op; this->is_signed = is_signed; @@ -2040,28 +2135,30 @@ template struct int_comparison_buffer { auto big_lwe_size = params.big_lwe_dimension + 1; if (allocate_gpu_memory) { - lsb_stream = cuda_create_stream(gpu_index); - msb_stream = cuda_create_stream(gpu_index); + lsb_stream = cuda_create_stream(gpu_indexes[0]); + msb_stream = cuda_create_stream(gpu_indexes[0]); // +1 to have space for signed comparison tmp_lwe_array_out = (Torus *)cuda_malloc_async( - big_lwe_size * (num_radix_blocks + 1) * sizeof(Torus), stream, - gpu_index); + big_lwe_size * (num_radix_blocks + 1) * sizeof(Torus), streams[0], + gpu_indexes[0]); tmp_packed_input = (Torus *)cuda_malloc_async( - big_lwe_size * 2 * num_radix_blocks * sizeof(Torus), stream, - gpu_index); + big_lwe_size * 2 * num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); // Block comparisons tmp_block_comparisons = (Torus *)cuda_malloc_async( - big_lwe_size * num_radix_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); // Cleaning LUT - identity_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + identity_lut = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); generate_device_accumulator( - stream, gpu_index, identity_lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], identity_lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, identity_lut_f); @@ -2070,11 +2167,12 @@ template struct int_comparison_buffer { return (x % total_modulus) == 0; }; - is_zero_lut = new int_radix_lut( - stream, gpu_index, params, 1, num_radix_blocks, allocate_gpu_memory); + is_zero_lut = + new int_radix_lut(streams, gpu_indexes, 1, params, 1, + num_radix_blocks, allocate_gpu_memory); generate_device_accumulator( - stream, gpu_index, is_zero_lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], is_zero_lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, is_zero_f); @@ -2082,7 +2180,7 @@ template struct int_comparison_buffer { case COMPARISON_TYPE::MAX: case COMPARISON_TYPE::MIN: cmux_buffer = new int_cmux_buffer( - stream, gpu_index, + streams, gpu_indexes, gpu_count, [op](Torus x) -> Torus { if (op == COMPARISON_TYPE::MAX) return (x == IS_SUPERIOR); @@ -2095,12 +2193,12 @@ template struct int_comparison_buffer { case COMPARISON_TYPE::LT: case COMPARISON_TYPE::LE: diff_buffer = new int_comparison_diff_buffer( - stream, gpu_index, op, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, allocate_gpu_memory); case COMPARISON_TYPE::EQ: case COMPARISON_TYPE::NE: eq_buffer = new int_comparison_eq_buffer( - stream, gpu_index, op, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, allocate_gpu_memory); break; default: @@ -2110,12 +2208,12 @@ template struct 
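int_comparison_buffer creates two auxiliary streams on the primary GPU so the LSB and MSB halves of a comparison can run concurrently; note also that the switch above falls through, so MAX/MIN end up allocating the diff and eq buffers they build on, and release mirrors the same structure. A sketch of the auxiliary-stream pattern, using the cuda_create_stream / cuda_destroy_stream helpers from this diff:

// Auxiliary streams on the primary GPU, created once per buffer and reused
// across calls; release() destroys them with the same GPU index.
cudaStream_t lsb_stream = cuda_create_stream(gpu_indexes[0]);
cudaStream_t msb_stream = cuda_create_stream(gpu_indexes[0]);
// ... enqueue LSB work on lsb_stream and MSB work on msb_stream ...
cuda_destroy_stream(lsb_stream, gpu_indexes[0]);
cuda_destroy_stream(msb_stream, gpu_indexes[0]);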
int_comparison_buffer { if (is_signed) { tmp_trivial_sign_block = (Torus *)cuda_malloc_async( - big_lwe_size * sizeof(Torus), stream, gpu_index); + big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - signed_lut = new int_radix_lut(stream, gpu_index, params, 1, 1, - allocate_gpu_memory); - signed_msb_lut = new int_radix_lut(stream, gpu_index, params, 1, - 1, allocate_gpu_memory); + signed_lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + signed_msb_lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); auto message_modulus = (int)params.message_modulus; uint32_t sign_bit_pos = log2(message_modulus) - 1; @@ -2150,50 +2248,51 @@ template struct int_comparison_buffer { }; generate_device_accumulator_bivariate( - stream, gpu_index, signed_lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], signed_lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, signed_lut_f); } } } - void release(cudaStream_t stream, uint32_t gpu_index) { + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { switch (op) { case COMPARISON_TYPE::MAX: case COMPARISON_TYPE::MIN: - cmux_buffer->release(stream, gpu_index); + cmux_buffer->release(streams, gpu_indexes, gpu_count); delete (cmux_buffer); case COMPARISON_TYPE::GT: case COMPARISON_TYPE::GE: case COMPARISON_TYPE::LT: case COMPARISON_TYPE::LE: - diff_buffer->release(stream, gpu_index); + diff_buffer->release(streams, gpu_indexes, gpu_count); delete (diff_buffer); case COMPARISON_TYPE::EQ: case COMPARISON_TYPE::NE: - eq_buffer->release(stream, gpu_index); + eq_buffer->release(streams, gpu_indexes, gpu_count); delete (eq_buffer); break; default: PANIC("Unsupported comparison operation.") } - identity_lut->release(stream, gpu_index); + identity_lut->release(streams, gpu_indexes, gpu_count); delete identity_lut; - is_zero_lut->release(stream, gpu_index); + is_zero_lut->release(streams, gpu_indexes, 1); delete is_zero_lut; - cuda_drop_async(tmp_lwe_array_out, stream, gpu_index); - cuda_drop_async(tmp_block_comparisons, stream, gpu_index); - cuda_drop_async(tmp_packed_input, stream, gpu_index); + cuda_drop_async(tmp_lwe_array_out, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_block_comparisons, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_packed_input, streams[0], gpu_indexes[0]); if (is_signed) { - cuda_drop_async(tmp_trivial_sign_block, stream, gpu_index); - signed_lut->release(stream, gpu_index); + cuda_drop_async(tmp_trivial_sign_block, streams[0], gpu_indexes[0]); + signed_lut->release(streams, gpu_indexes, gpu_count); delete (signed_lut); - signed_msb_lut->release(stream, gpu_index); + signed_msb_lut->release(streams, gpu_indexes, gpu_count); delete (signed_msb_lut); } - cuda_destroy_stream(lsb_stream, gpu_index); - cuda_destroy_stream(msb_stream, gpu_index); + cuda_destroy_stream(lsb_stream, gpu_indexes[0]); + cuda_destroy_stream(msb_stream, gpu_indexes[0]); } }; @@ -2243,54 +2342,56 @@ template struct int_div_rem_memory { // allocate and initialize if needed, temporary arrays used to calculate // cuda integer div_rem operation - void init_temporary_buffers(cudaStream_t stream, uint32_t gpu_index, - uint32_t num_blocks) { + void init_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, uint32_t num_blocks) { uint32_t big_lwe_size = params.big_lwe_dimension + 1; // non boolean temporary arrays, with `num_blocks` blocks remainder1 = (Torus 
*)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); remainder2 = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); numerator_block_stack = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); interesting_remainder2 = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); interesting_divisor = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); divisor_ms_blocks = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); new_remainder = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); cleaned_merged_interesting_remainder = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); tmp_1 = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); // temporary arrays used as stacks - tmp_radix = (Torus *)cuda_malloc_async( - big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream, gpu_index); + tmp_radix = (Torus *)cuda_malloc_async(big_lwe_size * (num_blocks + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); interesting_remainder1 = (Torus *)cuda_malloc_async( - big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream, gpu_index); + big_lwe_size * (num_blocks + 1) * sizeof(Torus), streams[0], + gpu_indexes[0]); numerator_block_1 = (Torus *)cuda_malloc_async( - big_lwe_size * 2 * sizeof(Torus), stream, gpu_index); + big_lwe_size * 2 * sizeof(Torus), streams[0], gpu_indexes[0]); // temporary arrays for boolean blocks subtraction_overflowed = (Torus *)cuda_malloc_async( - big_lwe_size * 1 * sizeof(Torus), stream, gpu_index); + big_lwe_size * 1 * sizeof(Torus), streams[0], gpu_indexes[0]); did_not_overflow = (Torus *)cuda_malloc_async( - big_lwe_size * 1 * sizeof(Torus), stream, gpu_index); + big_lwe_size * 1 * sizeof(Torus), streams[0], gpu_indexes[0]); overflow_sum = (Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); overflow_sum_radix = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), stream, gpu_index); + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); at_least_one_upper_block_is_non_zero = (Torus *)cuda_malloc_async( - big_lwe_size * 1 * sizeof(Torus), stream, gpu_index); + big_lwe_size * 1 * sizeof(Torus), streams[0], gpu_indexes[0]); } // initialize lookup tables for div_rem operation - void init_lookup_tables(cudaStream_t stream, uint32_t gpu_index, - uint32_t num_blocks) { + void init_lookup_tables(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, uint32_t num_blocks) { uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus); // create and generate 
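The temporary-buffer allocations above all follow the same shape: a number of radix blocks times the big LWE size, allocated asynchronously on the first GPU. Purely as an illustration (not a change proposed by this diff), the pattern condenses into a small helper:

// Illustrative helper capturing the allocation pattern used above.
auto alloc_blocks = [&](uint32_t n_blocks) -> Torus * {
  return (Torus *)cuda_malloc_async(
      (params.big_lwe_dimension + 1) * n_blocks * sizeof(Torus), streams[0],
      gpu_indexes[0]);
};
remainder1 = alloc_blocks(num_blocks);
tmp_radix = alloc_blocks(num_blocks + 1);   // stack with one extra block
subtraction_overflowed = alloc_blocks(1);   // single boolean block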
masking_luts_1[] and masking_lut_2[] @@ -2303,16 +2404,16 @@ template struct int_div_rem_memory { std::function lut_f_masking = [shifted_mask](Torus x) -> Torus { return x & shifted_mask; }; - masking_luts_1[i] = new int_radix_lut(stream, gpu_index, params, 1, - num_blocks, true); - masking_luts_2[i] = new int_radix_lut(stream, gpu_index, params, 1, - num_blocks, true); + masking_luts_1[i] = new int_radix_lut(streams, gpu_indexes, 1, + params, 1, num_blocks, true); + masking_luts_2[i] = new int_radix_lut(streams, gpu_indexes, 1, + params, 1, num_blocks, true); Torus *luts[2] = {masking_luts_1[i]->lut, masking_luts_2[i]->lut}; for (int j = 0; j < 2; j++) { generate_device_accumulator( - stream, gpu_index, luts[j], params.glwe_dimension, + streams[0], gpu_indexes[0], luts[j], params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f_masking); } @@ -2321,10 +2422,10 @@ template struct int_div_rem_memory { // create and generate message_extract_lut_1 and message_extract_lut_2 // both of them are equal but because they are used in two different // executions in parallel we need two different pbs_buffers. - message_extract_lut_1 = new int_radix_lut(stream, gpu_index, params, - 1, num_blocks, true); - message_extract_lut_2 = new int_radix_lut(stream, gpu_index, params, - 1, num_blocks, true); + message_extract_lut_1 = new int_radix_lut( + streams, gpu_indexes, 1, params, 1, num_blocks, true); + message_extract_lut_2 = new int_radix_lut( + streams, gpu_indexes, 1, params, 1, num_blocks, true); auto message_modulus = params.message_modulus; auto lut_f_message_extract = [message_modulus](Torus x) -> Torus { @@ -2334,7 +2435,7 @@ template struct int_div_rem_memory { Torus *luts[2] = {message_extract_lut_1->lut, message_extract_lut_2->lut}; for (int j = 0; j < 2; j++) { generate_device_accumulator( - stream, gpu_index, luts[j], params.glwe_dimension, + streams[0], gpu_indexes[0], luts[j], params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f_message_extract); } @@ -2350,9 +2451,9 @@ template struct int_div_rem_memory { // create and generate zero_out_if_overflow_did_not_happen zero_out_if_overflow_did_not_happen = new int_radix_lut *[2]; zero_out_if_overflow_did_not_happen[0] = new int_radix_lut( - stream, gpu_index, params, 1, num_blocks, true); + streams, gpu_indexes, 1, params, 1, num_blocks, true); zero_out_if_overflow_did_not_happen[1] = new int_radix_lut( - stream, gpu_index, params, 1, num_blocks, true); + streams, gpu_indexes, 1, params, 1, num_blocks, true); auto cur_lut_f = [&](Torus block, Torus overflow_sum) -> Torus { if (overflow_did_not_happen(overflow_sum)) { @@ -2363,20 +2464,20 @@ template struct int_div_rem_memory { }; generate_device_accumulator_bivariate_with_factor( - stream, gpu_index, zero_out_if_overflow_did_not_happen[0]->lut, + streams[0], gpu_indexes[0], zero_out_if_overflow_did_not_happen[0]->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, cur_lut_f, 2); generate_device_accumulator_bivariate_with_factor( - stream, gpu_index, zero_out_if_overflow_did_not_happen[1]->lut, + streams[0], gpu_indexes[0], zero_out_if_overflow_did_not_happen[1]->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, cur_lut_f, 3); // create and generate zero_out_if_overflow_happened zero_out_if_overflow_happened = new int_radix_lut *[2]; zero_out_if_overflow_happened[0] = new int_radix_lut( - stream, gpu_index, params, 1, 
num_blocks, true); + streams, gpu_indexes, 1, params, 1, num_blocks, true); zero_out_if_overflow_happened[1] = new int_radix_lut( - stream, gpu_index, params, 1, num_blocks, true); + streams, gpu_indexes, 1, params, 1, num_blocks, true); auto overflow_happened_f = [&](Torus block, Torus overflow_sum) -> Torus { if (overflow_happened(overflow_sum)) { @@ -2387,11 +2488,11 @@ template struct int_div_rem_memory { }; generate_device_accumulator_bivariate_with_factor( - stream, gpu_index, zero_out_if_overflow_happened[0]->lut, + streams[0], gpu_indexes[0], zero_out_if_overflow_happened[0]->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, overflow_happened_f, 2); generate_device_accumulator_bivariate_with_factor( - stream, gpu_index, zero_out_if_overflow_happened[1]->lut, + streams[0], gpu_indexes[0], zero_out_if_overflow_happened[1]->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, overflow_happened_f, 3); @@ -2403,51 +2504,52 @@ template struct int_div_rem_memory { }; merge_overflow_flags_luts[i] = new int_radix_lut( - stream, gpu_index, params, 1, num_blocks, true); + streams, gpu_indexes, 1, params, 1, num_blocks, true); generate_device_accumulator_bivariate( - stream, gpu_index, merge_overflow_flags_luts[i]->lut, + streams[0], gpu_indexes[0], merge_overflow_flags_luts[i]->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f_bit); } } - int_div_rem_memory(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_blocks, - bool allocate_gpu_memory) { + int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_blocks, bool allocate_gpu_memory) { this->params = params; shift_mem_1 = new int_logical_scalar_shift_buffer( - stream, gpu_index, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, + streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true); shift_mem_2 = new int_logical_scalar_shift_buffer( - stream, gpu_index, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, + streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true); overflow_sub_mem = new int_overflowing_sub_memory( - stream, gpu_index, params, num_blocks, true); + streams, gpu_indexes, 1, params, num_blocks, true); - comparison_buffer = - new int_comparison_buffer(stream, gpu_index, COMPARISON_TYPE::NE, - params, num_blocks, false, true); + comparison_buffer = new int_comparison_buffer( + streams, gpu_indexes, 1, COMPARISON_TYPE::NE, params, num_blocks, false, + true); - init_lookup_tables(stream, gpu_index, num_blocks); - init_temporary_buffers(stream, gpu_index, num_blocks); + init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks); + init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); - sub_stream_1 = cuda_create_stream(gpu_index); - sub_stream_2 = cuda_create_stream(gpu_index); - sub_stream_3 = cuda_create_stream(gpu_index); - sub_stream_4 = cuda_create_stream(gpu_index); + sub_stream_1 = cuda_create_stream(gpu_indexes[0]); + sub_stream_2 = cuda_create_stream(gpu_indexes[0]); + sub_stream_3 = cuda_create_stream(gpu_indexes[0]); + sub_stream_4 = cuda_create_stream(gpu_indexes[0]); } - void release(cudaStream_t stream, uint32_t gpu_index) { + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus); // release and delete other 
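The int_div_rem_memory constructor above composes the shift, overflowing-sub and comparison scratch objects (all currently pinned to the first GPU), builds its lookup tables and temporaries, and opens four sub-streams. A hypothetical caller-side sketch, assuming Torus = uint64_t:

// Hypothetical usage: build the div_rem scratch once, run divisions against
// it, then tear it down. gpu_count is forwarded even though most of the
// sub-buffers above currently stay on gpu_indexes[0].
auto *div_rem_mem = new int_div_rem_memory<uint64_t>(
    streams, gpu_indexes, gpu_count, params, num_blocks,
    /*allocate_gpu_memory*/ true);
// ... the div_rem entry points would be called here ...
div_rem_mem->release(streams, gpu_indexes, gpu_count);
delete div_rem_mem;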
operation memory objects - shift_mem_1->release(stream, gpu_index); - shift_mem_2->release(stream, gpu_index); - overflow_sub_mem->release(stream, gpu_index); - comparison_buffer->release(stream, gpu_index); + shift_mem_1->release(streams, gpu_indexes, 1); + shift_mem_2->release(streams, gpu_indexes, 1); + overflow_sub_mem->release(streams, gpu_indexes, 1); + comparison_buffer->release(streams, gpu_indexes, 1); delete shift_mem_1; delete shift_mem_2; delete overflow_sub_mem; @@ -2457,8 +2559,8 @@ template struct int_div_rem_memory { // masking_luts_1 and masking_luts_2 for (int i = 0; i < params.message_modulus - 1; i++) { - masking_luts_1[i]->release(stream, gpu_index); - masking_luts_2[i]->release(stream, gpu_index); + masking_luts_1[i]->release(streams, gpu_indexes, 1); + masking_luts_2[i]->release(streams, gpu_indexes, 1); delete masking_luts_1[i]; delete masking_luts_2[i]; @@ -2467,15 +2569,15 @@ template struct int_div_rem_memory { delete[] masking_luts_2; // message_extract_lut_1 and message_extract_lut_2 - message_extract_lut_1->release(stream, gpu_index); - message_extract_lut_2->release(stream, gpu_index); + message_extract_lut_1->release(streams, gpu_indexes, 1); + message_extract_lut_2->release(streams, gpu_indexes, 1); delete message_extract_lut_1; delete message_extract_lut_2; // zero_out_if_overflow_did_not_happen - zero_out_if_overflow_did_not_happen[0]->release(stream, gpu_index); - zero_out_if_overflow_did_not_happen[1]->release(stream, gpu_index); + zero_out_if_overflow_did_not_happen[0]->release(streams, gpu_indexes, 1); + zero_out_if_overflow_did_not_happen[1]->release(streams, gpu_indexes, 1); delete zero_out_if_overflow_did_not_happen[0]; delete zero_out_if_overflow_did_not_happen[1]; @@ -2483,8 +2585,8 @@ template struct int_div_rem_memory { delete[] zero_out_if_overflow_did_not_happen; // zero_out_if_overflow_happened - zero_out_if_overflow_happened[0]->release(stream, gpu_index); - zero_out_if_overflow_happened[1]->release(stream, gpu_index); + zero_out_if_overflow_happened[0]->release(streams, gpu_indexes, 1); + zero_out_if_overflow_happened[1]->release(streams, gpu_indexes, 1); delete zero_out_if_overflow_happened[0]; delete zero_out_if_overflow_happened[1]; @@ -2493,36 +2595,38 @@ template struct int_div_rem_memory { // merge_overflow_flags_luts for (int i = 0; i < num_bits_in_message; i++) { - merge_overflow_flags_luts[i]->release(stream, gpu_index); + merge_overflow_flags_luts[i]->release(streams, gpu_indexes, 1); delete merge_overflow_flags_luts[i]; } delete[] merge_overflow_flags_luts; // release sub streams - cuda_destroy_stream(sub_stream_1, gpu_index); - cuda_destroy_stream(sub_stream_2, gpu_index); - cuda_destroy_stream(sub_stream_3, gpu_index); - cuda_destroy_stream(sub_stream_4, gpu_index); + cuda_destroy_stream(sub_stream_1, gpu_indexes[0]); + cuda_destroy_stream(sub_stream_2, gpu_indexes[0]); + cuda_destroy_stream(sub_stream_3, gpu_indexes[0]); + cuda_destroy_stream(sub_stream_4, gpu_indexes[0]); // drop temporary buffers - cuda_drop_async(remainder1, stream, gpu_index); - cuda_drop_async(remainder2, stream, gpu_index); - cuda_drop_async(numerator_block_stack, stream, gpu_index); - cuda_drop_async(numerator_block_1, stream, gpu_index); - cuda_drop_async(tmp_radix, stream, gpu_index); - cuda_drop_async(interesting_remainder1, stream, gpu_index); - cuda_drop_async(interesting_remainder2, stream, gpu_index); - cuda_drop_async(interesting_divisor, stream, gpu_index); - cuda_drop_async(divisor_ms_blocks, stream, gpu_index); - 
cuda_drop_async(new_remainder, stream, gpu_index); - cuda_drop_async(subtraction_overflowed, stream, gpu_index); - cuda_drop_async(did_not_overflow, stream, gpu_index); - cuda_drop_async(overflow_sum, stream, gpu_index); - cuda_drop_async(overflow_sum_radix, stream, gpu_index); - cuda_drop_async(tmp_1, stream, gpu_index); - cuda_drop_async(at_least_one_upper_block_is_non_zero, stream, gpu_index); - cuda_drop_async(cleaned_merged_interesting_remainder, stream, gpu_index); + cuda_drop_async(remainder1, streams[0], gpu_indexes[0]); + cuda_drop_async(remainder2, streams[0], gpu_indexes[0]); + cuda_drop_async(numerator_block_stack, streams[0], gpu_indexes[0]); + cuda_drop_async(numerator_block_1, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_radix, streams[0], gpu_indexes[0]); + cuda_drop_async(interesting_remainder1, streams[0], gpu_indexes[0]); + cuda_drop_async(interesting_remainder2, streams[0], gpu_indexes[0]); + cuda_drop_async(interesting_divisor, streams[0], gpu_indexes[0]); + cuda_drop_async(divisor_ms_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(new_remainder, streams[0], gpu_indexes[0]); + cuda_drop_async(subtraction_overflowed, streams[0], gpu_indexes[0]); + cuda_drop_async(did_not_overflow, streams[0], gpu_indexes[0]); + cuda_drop_async(overflow_sum, streams[0], gpu_indexes[0]); + cuda_drop_async(overflow_sum_radix, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_1, streams[0], gpu_indexes[0]); + cuda_drop_async(at_least_one_upper_block_is_non_zero, streams[0], + gpu_indexes[0]); + cuda_drop_async(cleaned_merged_interesting_remainder, streams[0], + gpu_indexes[0]); } }; @@ -2531,9 +2635,9 @@ template struct int_bitop_buffer { int_radix_params params; int_radix_lut *lut; - int_bitop_buffer(cudaStream_t stream, uint32_t gpu_index, BITOP_TYPE op, - int_radix_params params, uint32_t num_radix_blocks, - bool allocate_gpu_memory) { + int_bitop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, BITOP_TYPE op, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -2541,7 +2645,7 @@ template struct int_bitop_buffer { case BITAND: case BITOR: case BITXOR: - lut = new int_radix_lut(stream, gpu_index, params, 1, + lut = new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); { auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus { @@ -2558,20 +2662,20 @@ template struct int_bitop_buffer { }; generate_device_accumulator_bivariate( - stream, gpu_index, lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_bivariate_f); } break; case BITNOT: - lut = new int_radix_lut(stream, gpu_index, params, 1, + lut = new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); { auto lut_not_f = [params](Torus x) -> Torus { return (~x) % params.message_modulus; }; generate_device_accumulator( - stream, gpu_index, lut->lut, params.glwe_dimension, + streams[0], gpu_indexes[0], lut->lut, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_not_f); } @@ -2580,7 +2684,7 @@ template struct int_bitop_buffer { // Scalar OP uint32_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size; - lut = new int_radix_lut(stream, gpu_index, params, + lut = new int_radix_lut(streams, gpu_indexes, gpu_count, params, params.message_modulus, num_radix_blocks, 
allocate_gpu_memory); @@ -2601,15 +2705,16 @@ template struct int_bitop_buffer { } }; generate_device_accumulator( - stream, gpu_index, lut_block, params.glwe_dimension, + streams[0], gpu_indexes[0], lut_block, params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_univariate_scalar_f); } } } - void release(cudaStream_t stream, uint32_t gpu_index) { - lut->release(stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + lut->release(streams, gpu_indexes, gpu_count); delete lut; } }; @@ -2621,9 +2726,9 @@ template struct int_scalar_mul_buffer { Torus *preshifted_buffer; Torus *all_shifted_buffer; - int_scalar_mul_buffer(cudaStream_t stream, uint32_t gpu_index, - int_radix_params params, uint32_t num_radix_blocks, - bool allocate_gpu_memory) { + int_scalar_mul_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; if (allocate_gpu_memory) { @@ -2636,35 +2741,36 @@ template struct int_scalar_mul_buffer { //// The idea is that with these we can create all other shift that are in //// range (0..total_bits) for free (block rotation) preshifted_buffer = (Torus *)cuda_malloc_async( - num_ciphertext_bits * lwe_size_bytes, stream, gpu_index); + num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]); all_shifted_buffer = (Torus *)cuda_malloc_async( - num_ciphertext_bits * num_radix_blocks * lwe_size_bytes, stream, - gpu_index); + num_ciphertext_bits * num_radix_blocks * lwe_size_bytes, streams[0], + gpu_indexes[0]); cuda_memset_async(preshifted_buffer, 0, - num_ciphertext_bits * lwe_size_bytes, stream, - gpu_index); + num_ciphertext_bits * lwe_size_bytes, streams[0], + gpu_indexes[0]); cuda_memset_async(all_shifted_buffer, 0, num_ciphertext_bits * num_radix_blocks * lwe_size_bytes, - stream, gpu_index); + streams[0], gpu_indexes[0]); logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer( - stream, gpu_index, LEFT_SHIFT, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, LEFT_SHIFT, params, num_radix_blocks, allocate_gpu_memory, all_shifted_buffer); sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory( - stream, gpu_index, params, num_radix_blocks, num_ciphertext_bits, - allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + num_ciphertext_bits, allocate_gpu_memory); } } - void release(cudaStream_t stream, uint32_t gpu_index) { - logical_scalar_shift_buffer->release(stream, gpu_index); - sum_ciphertexts_vec_mem->release(stream, gpu_index); - cuda_drop_async(preshifted_buffer, stream, gpu_index); - cuda_drop_async(all_shifted_buffer, stream, gpu_index); + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count); + sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]); + cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]); } }; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f..70297f50f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -9,13 +9,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, 
void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples); + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t gpu_offset = 0); void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples); + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t gpu_offset = 0); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index bc85e9518..4a9a0c64a 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -51,7 +51,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t max_shared_memory, uint32_t gpu_offset = 0); void cleanup_cuda_programmable_bootstrap_amortized(void *stream, uint32_t gpu_index, @@ -76,7 +76,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t max_shared_memory, uint32_t gpu_offset = 0); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -85,7 +85,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t max_shared_memory, uint32_t gpu_offset = 0); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); @@ -354,7 +354,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -364,7 +364,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); #if (CUDA_ARCH >= 900) template @@ -375,7 +375,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + 
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); template void scratch_cuda_programmable_bootstrap_tbc( diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index 7e7530f9a..f0f9efabd 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -29,23 +29,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); - -void scratch_cuda_generic_multi_bit_programmable_bootstrap_64( - void *stream, uint32_t gpu_index, int8_t **pbs_buffer, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); - -void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, - int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset, + uint32_t lwe_chunk_size = 0); void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, @@ -80,7 +65,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t gpu_offset, uint32_t lwe_chunk_size); #endif template @@ -107,7 +92,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0); + uint32_t gpu_offset, uint32_t lwe_chunk_size = 0); template void scratch_cuda_multi_bit_programmable_bootstrap( @@ -126,7 +111,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0); + uint32_t gpu_offset, uint32_t lwe_chunk_size = 0); template __host__ __device__ uint64_t diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 2a2f5175a..3190501fc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -11,7 +11,7 @@ set(SOURCES 
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper.h) + ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h) file(GLOB_RECURSE SOURCES "*.cu") add_library(tfhe_cuda_backend STATIC ${SOURCES}) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 524a1fa45..2373e418a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -9,14 +9,16 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t gpu_offset) { cuda_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples, + gpu_offset); } /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts. @@ -39,12 +41,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t gpu_offset) { cuda_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples, + gpu_offset); } diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index 989f9f253..964637afa 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -3,6 +3,7 @@ #include "device.h" #include "gadget.cuh" +#include "helper_multi_gpu.h" #include "polynomial/functions.cuh" #include "polynomial/polynomial_math.cuh" #include "torus.cuh" @@ -37,23 +38,26 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level, // threads in y are used to paralelize the lwe_dimension_in loop. // shared memory is used to store intermediate results of the reduction. 
template -__global__ void -keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, - uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) { +__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, + Torus *lwe_array_in, Torus *lwe_input_indexes, + Torus *ksk, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, uint32_t base_log, + uint32_t level_count, int gpu_offset) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x; extern __shared__ int8_t sharedmem[]; Torus *lwe_acc_out = (Torus *)sharedmem; - auto block_lwe_array_out = get_chunk( - lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1); + auto block_lwe_array_out = + get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset], + lwe_dimension_out + 1); if (tid <= lwe_dimension_out) { Torus local_lwe_out = 0; - auto block_lwe_array_in = get_chunk( - lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1); + auto block_lwe_array_in = + get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset], + lwe_dimension_in + 1); if (tid == lwe_dimension_out && threadIdx.y == 0) { local_lwe_out = block_lwe_array_in[lwe_dimension_in]; @@ -99,7 +103,8 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t gpu_offset = 0) { cudaSetDevice(gpu_index); @@ -115,8 +120,42 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( keyswitch<<>>( lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk, - lwe_dimension_in, lwe_dimension_out, base_log, level_count); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset); check_cuda_error(cudaGetLastError()); } +template +void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, Torus **ksks, + uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, + uint32_t base_log, uint32_t level_count, + uint32_t num_samples, bool sync_streams = true) { + + /// If the number of radix blocks is lower than the number of GPUs, not all + /// GPUs will be active and there will be 1 input per GPU + auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count); + int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count); + if (sync_streams) + cuda_synchronize_stream(streams[0], gpu_indexes[0]); +#pragma omp parallel for num_threads(active_gpu_count) + for (uint i = 0; i < active_gpu_count; i++) { + int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count); + int gpu_offset = get_gpu_offset(num_samples, i, gpu_count); + + // Compute Keyswitch + cuda_keyswitch_lwe_ciphertext_vector( + streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, + lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in, + lwe_dimension_out, base_log, level_count, num_samples_on_gpu, + gpu_offset); + } + + if (sync_streams) + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } +} + #endif diff --git 
a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 8092bf8ab..7583da3c2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -6,7 +6,7 @@ cudaStream_t cuda_create_stream(uint32_t gpu_index) { check_cuda_error(cudaSetDevice(gpu_index)); cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); + check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); return stream; } @@ -47,9 +47,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index)); if (support_async_alloc) { - cuda_synchronize_stream(stream, gpu_index); check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream)); - cuda_synchronize_stream(stream, gpu_index); } else { check_cuda_error(cudaMalloc((void **)&ptr, size)); } @@ -121,21 +119,22 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, return; cudaPointerAttributes attr_dest; check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest)); - if (attr_dest.device != gpu_index && attr_dest.type != cudaMemoryTypeDevice) { + if (attr_dest.type != cudaMemoryTypeDevice) { PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.") } cudaPointerAttributes attr_src; check_cuda_error(cudaPointerGetAttributes(&attr_src, src)); - if (attr_src.device != gpu_index && attr_src.type != cudaMemoryTypeDevice) { + if (attr_src.type != cudaMemoryTypeDevice) { PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.") } - if (attr_src.device != attr_dest.device) { - PANIC("Cuda error: different devices specified in copy from GPU to GPU.") - } - check_cuda_error(cudaSetDevice(gpu_index)); - check_cuda_error( - cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream)); + if (attr_src.device == attr_dest.device) { + check_cuda_error( + cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream)); + } else { + check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src, + attr_src.device, size, stream)); + } } /// Synchronizes device diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu index 4318a9c6b..28607f458 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu @@ -1,13 +1,13 @@ #include "integer/bitwise_ops.cuh" void scratch_cuda_integer_radix_bitop_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t lwe_ciphertext_count, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type, - bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + BITOP_TYPE op_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -15,7 
+15,7 @@ void scratch_cuda_integer_radix_bitop_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_radix_bitop_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_bitop_buffer **)mem_ptr, lwe_ciphertext_count, params, op_type, allocate_gpu_memory); } @@ -23,34 +23,34 @@ void scratch_cuda_integer_radix_bitop_kb_64( void cuda_bitop_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr, - void *bsk, void *ksk, uint32_t lwe_ciphertext_count) { + void **bsks, void **ksks, uint32_t lwe_ciphertext_count) { host_integer_radix_bitop_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), static_cast(lwe_array_1), static_cast(lwe_array_2), - (int_bitop_buffer *)mem_ptr, bsk, static_cast(ksk), + (int_bitop_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count); } void cuda_bitnot_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void *bsk, - void *ksk, uint32_t lwe_ciphertext_count) { + void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void **bsks, + void **ksks, uint32_t lwe_ciphertext_count) { host_integer_radix_bitnot_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), static_cast(lwe_array_in), - (int_bitop_buffer *)mem_ptr, bsk, static_cast(ksk), + (int_bitop_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count); } -void cleanup_cuda_integer_bitop(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void) { +void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_bitop_buffer *mem_ptr = (int_bitop_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh index fe0b3e58d..6ad42c248 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh @@ -16,38 +16,38 @@ __host__ void host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, - int_bitop_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { + int_bitop_buffer *mem_ptr, void **bsks, + Torus **ksks, uint32_t num_radix_blocks) { auto lut = mem_ptr->lut; integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2, - bsk, ksk, num_radix_blocks, lut, lut->params.message_modulus); + bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus); } template __host__ void host_integer_radix_bitnot_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer *mem_ptr, - void *bsk, Torus *ksk, uint32_t num_radix_blocks) { + void **bsks, Torus **ksks, uint32_t num_radix_blocks) { auto lut = mem_ptr->lut; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk, + streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks, num_radix_blocks, lut); } template __host__ void scratch_cuda_integer_radix_bitop_kb( - cudaStream_t 
stream, uint32_t gpu_index, int_bitop_buffer **mem_ptr, - uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op, - bool allocate_gpu_memory) { + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_bitop_buffer **mem_ptr, uint32_t num_radix_blocks, + int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); - *mem_ptr = new int_bitop_buffer(stream, gpu_index, op, params, - num_radix_blocks, allocate_gpu_memory); + *mem_ptr = + new int_bitop_buffer(streams, gpu_indexes, gpu_count, op, params, + num_radix_blocks, allocate_gpu_memory); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu index ae6d5d84f..28f685b6e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu @@ -1,12 +1,13 @@ #include "integer/cmux.cuh" void scratch_cuda_integer_radix_cmux_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t lwe_ciphertext_count, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -17,7 +18,7 @@ void scratch_cuda_integer_radix_cmux_kb_64( [](uint64_t x) -> uint64_t { return x == 1; }; scratch_cuda_integer_radix_cmux_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_cmux_buffer **)mem_ptr, predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory); } @@ -25,7 +26,7 @@ void scratch_cuda_integer_radix_cmux_kb_64( void cuda_cmux_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_condition, void *lwe_array_true, - void *lwe_array_false, int8_t *mem_ptr, void *bsk, void *ksk, + void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count) { host_integer_radix_cmux_kb( @@ -34,15 +35,16 @@ void cuda_cmux_integer_radix_ciphertext_kb_64( static_cast(lwe_condition), static_cast(lwe_array_true), static_cast(lwe_array_false), - (int_cmux_buffer *)mem_ptr, bsk, static_cast(ksk), + (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count); } -void cleanup_cuda_integer_radix_cmux(void *stream, uint32_t gpu_index, +void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_cmux_buffer *mem_ptr = (int_cmux_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 92aa5fe02..ed9860d68 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -9,8 +9,8 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_input, Torus *lwe_condition, int_zero_out_if_buffer *mem_ptr, - int_radix_lut *predicate, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { + int_radix_lut *predicate, void **bsks, + Torus **ksks, uint32_t num_radix_blocks) { cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; @@ -36,18 +36,17 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes, } integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsk, - ksk, num_radix_blocks, predicate); + streams, gpu_indexes, 1, lwe_array_out, tmp_lwe_array_input, bsks, ksks, + num_radix_blocks, predicate); } template __host__ void host_integer_radix_cmux_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true, - Torus *lwe_array_false, int_cmux_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { + Torus *lwe_array_false, int_cmux_buffer *mem_ptr, void **bsks, + Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; // Since our CPU threads will be working on different streams we shall assert @@ -62,16 +61,17 @@ __host__ void host_integer_radix_cmux_kb( #pragma omp section { auto mem_true = mem_ptr->zero_if_true_buffer; - zero_out_if(&true_stream, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct, + zero_out_if(&true_stream, gpu_indexes, 1, mem_ptr->tmp_true_ct, lwe_array_true, lwe_condition, mem_true, - mem_ptr->inverted_predicate_lut, bsk, ksk, num_radix_blocks); + mem_ptr->inverted_predicate_lut, bsks, ksks, + num_radix_blocks); } #pragma omp section { auto mem_false = mem_ptr->zero_if_false_buffer; - zero_out_if(&false_stream, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct, + zero_out_if(&false_stream, gpu_indexes, 1, mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition, mem_false, - mem_ptr->predicate_lut, bsk, ksk, num_radix_blocks); + mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks); } } cuda_synchronize_stream(true_stream, gpu_indexes[0]); @@ -86,19 +86,19 @@ __host__ void host_integer_radix_cmux_kb( num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsk, ksk, + streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks, num_radix_blocks, mem_ptr->message_extract_lut); } template __host__ void scratch_cuda_integer_radix_cmux_kb( - cudaStream_t stream, uint32_t gpu_index, int_cmux_buffer **mem_ptr, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_cmux_buffer **mem_ptr, std::function predicate_lut_f, uint32_t num_radix_blocks, int_radix_params params, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); - *mem_ptr = - new int_cmux_buffer(stream, gpu_index, predicate_lut_f, params, - num_radix_blocks, allocate_gpu_memory); + *mem_ptr = new int_cmux_buffer(streams, gpu_indexes, gpu_count, + predicate_lut_f, params, + num_radix_blocks, allocate_gpu_memory); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu index 9323efd4c..7f877722b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu @@ -1,13 +1,13 @@ #include "integer/comparison.cuh" void scratch_cuda_integer_radix_comparison_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed, - bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_radix_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -18,7 +18,7 @@ void scratch_cuda_integer_radix_comparison_kb_64( case EQ: case NE: scratch_cuda_integer_radix_comparison_check_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_comparison_buffer **)mem_ptr, num_radix_blocks, params, op_type, false, allocate_gpu_memory); break; @@ -29,7 +29,7 @@ void scratch_cuda_integer_radix_comparison_kb_64( case MAX: case MIN: scratch_cuda_integer_radix_comparison_check_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_comparison_buffer **)mem_ptr, num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory); break; @@ -39,7 +39,7 @@ void scratch_cuda_integer_radix_comparison_kb_64( void cuda_comparison_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr, - void *bsk, void *ksk, uint32_t num_radix_blocks) { + void **bsks, void **ksks, uint32_t num_radix_blocks) { int_comparison_buffer *buffer = (int_comparison_buffer *)mem_ptr; @@ -50,8 +50,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), static_cast(lwe_array_1), - static_cast(lwe_array_2), buffer, bsk, - static_cast(ksk), num_radix_blocks); + static_cast(lwe_array_2), buffer, bsks, (uint64_t **)(ksks), + num_radix_blocks); break; case GT: case GE: @@ -62,7 +62,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64( static_cast(lwe_array_out), static_cast(lwe_array_1), static_cast(lwe_array_2), buffer, - buffer->diff_buffer->operator_f, bsk, static_cast(ksk), + buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks), num_radix_blocks); break; case MAX: @@ -71,18 +71,19 @@ void cuda_comparison_integer_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), static_cast(lwe_array_1), - static_cast(lwe_array_2), buffer, bsk, - static_cast(ksk), num_radix_blocks); + static_cast(lwe_array_2), buffer, bsks, (uint64_t **)(ksks), + num_radix_blocks); break; default: PANIC("Cuda error: integer operation not supported") } } -void cleanup_cuda_integer_comparison(void *stream, uint32_t gpu_index, +void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, + uint32_t 
gpu_count, int8_t **mem_ptr_void) { int_comparison_buffer *mem_ptr = (int_comparison_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 35ff310a0..6df619242 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -56,12 +56,11 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index, * */ template -__host__ void -are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_in, - int_comparison_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { +__host__ void are_all_comparisons_block_true( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus *lwe_array_in, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, + uint32_t num_radix_blocks) { cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; @@ -94,6 +93,8 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes, // as in the worst case we will be adding `max_value` ones auto input_blocks = tmp_out; auto accumulator = are_all_block_true_buffer->tmp_block_accumulated; + auto is_equal_to_num_blocks_map = + &are_all_block_true_buffer->is_equal_to_lut_map; for (int i = 0; i < num_chunks; i++) { accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, input_blocks, big_lwe_dimension, chunk_length); @@ -103,8 +104,6 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes, input_blocks += (big_lwe_dimension + 1) * chunk_length; } accumulator = are_all_block_true_buffer->tmp_block_accumulated; - auto is_equal_to_num_blocks_map = - &are_all_block_true_buffer->is_equal_to_lut_map; // Selects a LUT int_radix_lut *lut; @@ -118,9 +117,8 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes, lut = (*is_equal_to_num_blocks_map)[chunk_length]; } else { // LUT needs to be computed - auto new_lut = - new int_radix_lut(streams[0], gpu_indexes[0], params, - max_value, num_radix_blocks, true); + auto new_lut = new int_radix_lut( + streams, gpu_indexes, 1, params, max_value, num_radix_blocks, true); auto is_equal_to_num_blocks_lut_f = [max_value, chunk_length](Torus x) -> Torus { @@ -140,13 +138,13 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes, if (remaining_blocks == 1) { // In the last iteration we copy the output to the final address integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk, - 1, lut); + streams, gpu_indexes, 1, lwe_array_out, accumulator, bsks, ksks, 1, + lut); return; } else { integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsk, ksk, - num_chunks, lut); + streams, gpu_indexes, 1, tmp_out, accumulator, bsks, ksks, num_chunks, + lut); } } } @@ -161,7 +159,7 @@ template __host__ void is_at_least_one_comparisons_block_true( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { cudaSetDevice(gpu_indexes[0]); @@ -207,13 +205,13 
@@ __host__ void is_at_least_one_comparisons_block_true( if (remaining_blocks == 1) { // In the last iteration we copy the output to the final address integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk, - 1, lut); + streams, gpu_indexes, 1, lwe_array_out, accumulator, bsks, ksks, 1, + lut); return; } else { integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out, - accumulator, bsk, ksk, num_chunks, lut); + streams, gpu_indexes, 1, mem_ptr->tmp_lwe_array_out, accumulator, + bsks, ksks, num_chunks, lut); } } } @@ -241,7 +239,7 @@ template __host__ void host_compare_with_zero_equality( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, int32_t num_radix_blocks, int_radix_lut *zero_comparison) { cudaSetDevice(gpu_indexes[0]); @@ -293,17 +291,17 @@ __host__ void host_compare_with_zero_equality( } integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, sum, sum, bsk, ksk, num_sum_blocks, + streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks, zero_comparison); are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out, - sum, mem_ptr, bsk, ksk, num_sum_blocks); + sum, mem_ptr, bsks, ksks, num_sum_blocks); } template __host__ void host_integer_radix_equality_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { cudaSetDevice(gpu_indexes[0]); @@ -313,7 +311,7 @@ __host__ void host_integer_radix_equality_check_kb( auto comparisons = mem_ptr->tmp_block_comparisons; integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2, - bsk, ksk, num_radix_blocks, eq_buffer->operator_lut, + bsks, ksks, num_radix_blocks, eq_buffer->operator_lut, eq_buffer->operator_lut->params.message_modulus); // This takes a Vec of blocks, where each block is either 0 or 1. 
@@ -321,7 +319,7 @@ __host__ void host_integer_radix_equality_check_kb( // It returns a block encrypting 1 if all input blocks are 1 // otherwise the block encrypts 0 are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr, bsk, ksk, + comparisons, mem_ptr, bsks, ksks, num_radix_blocks); } @@ -330,10 +328,9 @@ __host__ void compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right, - int_comparison_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { + int_comparison_buffer *mem_ptr, void **bsks, + Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; @@ -360,7 +357,7 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Apply LUT to compare to 0 auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsk, ksk, + streams, gpu_indexes, 1, lwe_array_out, lwe_array_out, bsks, ksks, num_radix_blocks, is_non_zero_lut); // Add one @@ -380,10 +377,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_block_comparisons, int_tree_sign_reduction_buffer *tree_buffer, - std::function sign_handler_f, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { + std::function sign_handler_f, void **bsks, + Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = tree_buffer->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -413,7 +409,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, partial_block_count, 4); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, x, y, bsk, ksk, + streams, gpu_indexes, gpu_count, x, y, bsks, ksks, partial_block_count >> 1, inner_tree_leaf); if ((partial_block_count % 2) != 0) { @@ -456,8 +452,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, message_modulus, carry_modulus, f); // Last leaf - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, y, bsk, ksk, 1, last_lut); + integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, + gpu_count, lwe_array_out, y, + bsks, ksks, 1, last_lut); } template @@ -465,10 +462,9 @@ __host__ void host_integer_radix_difference_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right, int_comparison_buffer *mem_ptr, - std::function reduction_lut_f, void *bsk, Torus *ksk, + std::function reduction_lut_f, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto diff_buffer = mem_ptr->diff_buffer; auto params = mem_ptr->params; @@ -500,10 +496,10 @@ __host__ void host_integer_radix_difference_check_kb( // Clean noise auto identity_lut = mem_ptr->identity_lut; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, packed_left, packed_left, bsk, ksk, + streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks, packed_num_radix_blocks, identity_lut); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, packed_right, packed_right, 
bsk, ksk, + streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks, packed_num_radix_blocks, identity_lut); lhs = packed_left; @@ -520,14 +516,15 @@ __host__ void host_integer_radix_difference_check_kb( // Compare packed blocks, or simply the total number of radix blocks in the // inputs compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs, - rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks); + rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks); num_comparisons = packed_num_radix_blocks; } else { // Packing is possible if (carry_modulus >= message_modulus) { // Compare (num_radix_blocks - 2) / 2 packed blocks compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs, - rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks); + rhs, mem_ptr, bsks, ksks, + packed_num_radix_blocks); // Compare the last block before the sign block separately auto identity_lut = mem_ptr->identity_lut; @@ -538,37 +535,37 @@ __host__ void host_integer_radix_difference_check_kb( packed_num_radix_blocks * big_lwe_size; integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_left_block_before_sign_block, - lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1, + lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1, identity_lut); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_right_block_before_sign_block, - lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1, - identity_lut); + lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, + 1, identity_lut); compare_radix_blocks_kb( streams, gpu_indexes, gpu_count, comparisons + packed_num_radix_blocks * big_lwe_size, last_left_block_before_sign_block, last_right_block_before_sign_block, - mem_ptr, bsk, ksk, 1); + mem_ptr, bsks, ksks, 1); // Compare the sign block separately integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size, lwe_array_left + (num_radix_blocks - 1) * big_lwe_size, - lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1, - mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); + lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks, + 1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); num_comparisons = packed_num_radix_blocks + 2; } else { compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, - lwe_array_left, lwe_array_right, mem_ptr, bsk, - ksk, num_radix_blocks - 1); + lwe_array_left, lwe_array_right, mem_ptr, bsks, + ksks, num_radix_blocks - 1); // Compare the sign block separately integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons + (num_radix_blocks - 1) * big_lwe_size, lwe_array_left + (num_radix_blocks - 1) * big_lwe_size, - lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1, - mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); + lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks, + 1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); num_comparisons = num_radix_blocks; } } @@ -578,20 +575,19 @@ __host__ void host_integer_radix_difference_check_kb( // final sign tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer, - reduction_lut_f, bsk, ksk, num_comparisons); + reduction_lut_f, bsks, ksks, num_comparisons); } template __host__ void 
scratch_cuda_integer_radix_comparison_check_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_comparison_buffer **mem_ptr, uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op, bool is_signed, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); - *mem_ptr = new int_comparison_buffer(stream, gpu_index, op, params, - num_radix_blocks, is_signed, - allocate_gpu_memory); + *mem_ptr = new int_comparison_buffer(streams, gpu_indexes, gpu_count, + op, params, num_radix_blocks, + is_signed, allocate_gpu_memory); } template @@ -599,20 +595,19 @@ __host__ void host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right, - int_comparison_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t total_num_radix_blocks) { + int_comparison_buffer *mem_ptr, void **bsks, + Torus **ksks, uint32_t total_num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); // Compute the sign host_integer_radix_difference_check_kb( streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out, - lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsk, - ksk, total_num_radix_blocks); + lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks, + ksks, total_num_radix_blocks); // Selector host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, + lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu index 63fb1e0e1..79ca59553 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu @@ -1,12 +1,12 @@ #include "integer/div_rem.cuh" void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -14,15 +14,15 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_div_rem_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_div_rem_memory **)mem_ptr, num_blocks, params, allocate_gpu_memory); } void cuda_integer_div_rem_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient, - void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, void *bsk, - void *ksk, uint32_t num_blocks) { + void *remainder, void *numerator, void 
*divisor, int8_t *mem_ptr, + void **bsks, void **ksks, uint32_t num_blocks) { auto mem = (int_div_rem_memory *)mem_ptr; @@ -32,7 +32,7 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), static_cast(numerator), static_cast(divisor), - bsk, static_cast(ksk), mem, num_blocks); + bsks, (uint64_t **)(ksks), mem, num_blocks); break; case 1024: @@ -40,35 +40,35 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), static_cast(numerator), static_cast(divisor), - bsk, static_cast(ksk), mem, num_blocks); + bsks, (uint64_t **)(ksks), mem, num_blocks); break; case 2048: host_integer_div_rem_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), static_cast(numerator), static_cast(divisor), - bsk, static_cast(ksk), mem, num_blocks); + bsks, (uint64_t **)(ksks), mem, num_blocks); break; case 4096: host_integer_div_rem_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), static_cast(numerator), static_cast(divisor), - bsk, static_cast(ksk), mem, num_blocks); + bsks, (uint64_t **)(ksks), mem, num_blocks); break; case 8192: host_integer_div_rem_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), static_cast(numerator), static_cast(divisor), - bsk, static_cast(ksk), mem, num_blocks); + bsks, (uint64_t **)(ksks), mem, num_blocks); break; case 16384: host_integer_div_rem_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), static_cast(numerator), static_cast(divisor), - bsk, static_cast(ksk), mem, num_blocks); + bsks, (uint64_t **)(ksks), mem, num_blocks); break; default: PANIC("Cuda error (integer div_rem): unsupported polynomial size. 
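The dispatch above also shows the boundary convention this diff settles on: the C-facing functions keep void pointers and cast exactly once on entry, streams to cudaStream_t * and the keyswitching-key array to uint64_t ** (the bootstrapping keys stay void ** because their concrete type depends on the PBS variant). A minimal sketch of that convention, with a hypothetical typed host function standing in for host_integer_div_rem_kb:

#include <cuda_runtime.h>
#include <cstdint>

// Hypothetical typed host-side implementation (stand-in for the real ones).
template <typename Torus>
void host_op_typed(cudaStream_t *streams, uint32_t *gpu_indexes,
                   uint32_t gpu_count, Torus *out, void **bsks, Torus **ksks,
                   uint32_t num_blocks) {
  // ... keyswitch + PBS work would go here ...
}

// Untyped C-facing wrapper: cast once, then dispatch to the typed code.
void cuda_op_kb_64(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
                   void *out, void **bsks, void **ksks, uint32_t num_blocks) {
  host_op_typed<uint64_t>((cudaStream_t *)(streams), gpu_indexes, gpu_count,
                          static_cast<uint64_t *>(out), bsks,
                          (uint64_t **)(ksks), num_blocks);
}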
" @@ -76,10 +76,10 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64( } } -void cleanup_cuda_integer_div_rem(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void) { +void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_div_rem_memory *mem_ptr = (int_div_rem_memory *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 76cbea2e7..2fc1723ca 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -164,22 +164,21 @@ template struct lwe_ciphertext_list { }; template -__host__ void -scratch_cuda_integer_div_rem_kb(cudaStream_t stream, uint32_t gpu_index, - int_div_rem_memory **mem_ptr, - uint32_t num_blocks, int_radix_params params, - bool allocate_gpu_memory) { +__host__ void scratch_cuda_integer_div_rem_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_div_rem_memory **mem_ptr, uint32_t num_blocks, + int_radix_params params, bool allocate_gpu_memory) { - *mem_ptr = new int_div_rem_memory(stream, gpu_index, params, - num_blocks, allocate_gpu_memory); + *mem_ptr = new int_div_rem_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } template __host__ void host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *quotient, Torus *remainder, - Torus *numerator, Torus *divisor, void *bsk, - uint64_t *ksk, int_div_rem_memory *mem_ptr, + Torus *numerator, Torus *divisor, void **bsks, + uint64_t **ksks, int_div_rem_memory *mem_ptr, uint32_t num_blocks) { auto radix_params = mem_ptr->params; @@ -289,8 +288,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t shifted_mask = full_message_mask >> shift_amount; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, interesting_divisor.last_block(), - interesting_divisor.last_block(), bsk, ksk, 1, + streams, gpu_indexes, 1, interesting_divisor.last_block(), + interesting_divisor.last_block(), bsks, ksks, 1, mem_ptr->masking_luts_1[shifted_mask]); }; // trim_last_interesting_divisor_bits @@ -317,8 +316,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, shifted_mask = shifted_mask & full_message_mask; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(), - divisor_ms_blocks.first_block(), bsk, ksk, 1, + streams, gpu_indexes, 1, divisor_ms_blocks.first_block(), + divisor_ms_blocks.first_block(), bsks, ksks, 1, mem_ptr->masking_luts_2[shifted_mask]); }; // trim_first_divisor_ms_bits @@ -341,15 +340,15 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, streams[0], gpu_indexes[0]); host_integer_radix_logical_scalar_shift_kb_inplace( - streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1, - mem_ptr->shift_mem_1, bsk, ksk, interesting_remainder1.len); + streams, gpu_indexes, 1, interesting_remainder1.data, 1, + mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len); tmp_radix.clone_from(interesting_remainder1, 0, interesting_remainder1.len - 1, streams[0], gpu_indexes[0]); host_radix_blocks_rotate_left( - streams, gpu_indexes, gpu_count, interesting_remainder1.data, + streams, gpu_indexes, 1, 
interesting_remainder1.data, tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size); numerator_block_1.clone_from( @@ -370,8 +369,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, auto left_shift_interesting_remainder2 = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { host_integer_radix_logical_scalar_shift_kb_inplace( - streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1, - mem_ptr->shift_mem_2, bsk, ksk, interesting_remainder2.len); + streams, gpu_indexes, 1, interesting_remainder2.data, 1, + mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len); }; // left_shift_interesting_remainder2 cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -380,26 +379,26 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, #pragma omp section { // interesting_divisor - trim_last_interesting_divisor_bits(&mem_ptr->sub_stream_1, - &gpu_indexes[0], 1); + trim_last_interesting_divisor_bits(&mem_ptr->sub_stream_1, gpu_indexes, + 1); } #pragma omp section { // divisor_ms_blocks - trim_first_divisor_ms_bits(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1); + trim_first_divisor_ms_bits(&mem_ptr->sub_stream_2, gpu_indexes, 1); } #pragma omp section { // interesting_remainder1 // numerator_block_stack - left_shift_interesting_remainder1(&mem_ptr->sub_stream_3, - &gpu_indexes[0], 1); + left_shift_interesting_remainder1(&mem_ptr->sub_stream_3, gpu_indexes, + 1); } #pragma omp section { // interesting_remainder2 - left_shift_interesting_remainder2(&mem_ptr->sub_stream_4, - &gpu_indexes[0], 1); + left_shift_interesting_remainder2(&mem_ptr->sub_stream_4, gpu_indexes, + 1); } } cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]); @@ -436,9 +435,9 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { host_integer_overflowing_sub_kb( - streams, gpu_indexes, gpu_count, new_remainder.data, + streams, gpu_indexes, 1, new_remainder.data, subtraction_overflowed.data, merged_interesting_remainder.data, - interesting_divisor.data, bsk, ksk, mem_ptr->overflow_sub_mem, + interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem, merged_interesting_remainder.len); }; @@ -457,17 +456,16 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // But we are in the special case where scalar == 0 // So we can skip some stuff host_compare_with_zero_equality( - streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data, - mem_ptr->comparison_buffer, bsk, ksk, trivial_blocks.len, + streams, gpu_indexes, 1, tmp_1.data, trivial_blocks.data, + mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len, mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut); tmp_1.len = ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1); is_at_least_one_comparisons_block_true( - streams, gpu_indexes, gpu_count, - at_least_one_upper_block_is_non_zero.data, tmp_1.data, - mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len); + streams, gpu_indexes, 1, at_least_one_upper_block_is_non_zero.data, + tmp_1.data, mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len); } }; @@ -475,15 +473,15 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // so that it can be safely used in bivariate PBSes // fills: // `cleaned_merged_interesting_remainder` - radix ciphertext - auto create_clean_version_of_merged_remainder = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - 
integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, - cleaned_merged_interesting_remainder.data, - cleaned_merged_interesting_remainder.data, bsk, ksk, - cleaned_merged_interesting_remainder.len, - mem_ptr->message_extract_lut_1); - }; + auto create_clean_version_of_merged_remainder = [&](cudaStream_t *streams, + uint32_t *gpu_indexes, + uint32_t gpu_count) { + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, 1, cleaned_merged_interesting_remainder.data, + cleaned_merged_interesting_remainder.data, bsks, ksks, + cleaned_merged_interesting_remainder.len, + mem_ptr->message_extract_lut_1); + }; // phase 2 cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -493,18 +491,18 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, { // new_remainder // subtraction_overflowed - do_overflowing_sub(&mem_ptr->sub_stream_1, &gpu_indexes[0], 1); + do_overflowing_sub(&mem_ptr->sub_stream_1, gpu_indexes, 1); } #pragma omp section { // at_least_one_upper_block_is_non_zero - check_divisor_upper_blocks(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1); + check_divisor_upper_blocks(&mem_ptr->sub_stream_2, gpu_indexes, 1); } #pragma omp section { // cleaned_merged_interesting_remainder create_clean_version_of_merged_remainder(&mem_ptr->sub_stream_3, - &gpu_indexes[0], 1); + gpu_indexes, 1); } } cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]); @@ -525,10 +523,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, auto conditionally_zero_out_merged_interesting_remainder = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { integer_radix_apply_bivariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, + streams, gpu_indexes, 1, cleaned_merged_interesting_remainder.data, cleaned_merged_interesting_remainder.data, - overflow_sum_radix.data, bsk, ksk, + overflow_sum_radix.data, bsks, ksks, cleaned_merged_interesting_remainder.len, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id], factor); @@ -537,18 +535,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, auto conditionally_zero_out_merged_new_remainder = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { integer_radix_apply_bivariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, new_remainder.data, - new_remainder.data, overflow_sum_radix.data, bsk, ksk, - new_remainder.len, + streams, gpu_indexes, 1, new_remainder.data, new_remainder.data, + overflow_sum_radix.data, bsks, ksks, new_remainder.len, mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor); }; auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { integer_radix_apply_bivariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, did_not_overflow.data, + streams, gpu_indexes, 1, did_not_overflow.data, subtraction_overflowed.data, - at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1, + at_least_one_upper_block_is_non_zero.data, bsks, ksks, 1, mem_ptr->merge_overflow_flags_luts[pos_in_block], mem_ptr->merge_overflow_flags_luts[pos_in_block] ->params.message_modulus); @@ -566,18 +563,18 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, { // cleaned_merged_interesting_remainder conditionally_zero_out_merged_interesting_remainder( - &mem_ptr->sub_stream_1, &gpu_indexes[0], 1); + &mem_ptr->sub_stream_1, gpu_indexes, 1); } #pragma omp section { // new_remainder conditionally_zero_out_merged_new_remainder(&mem_ptr->sub_stream_2, - &gpu_indexes[0], 
1); + gpu_indexes, 1); } #pragma omp section { // quotient - set_quotient_bit(&mem_ptr->sub_stream_3, &gpu_indexes[0], 1); + set_quotient_bit(&mem_ptr->sub_stream_3, gpu_indexes, 1); } } cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]); @@ -607,14 +604,14 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, #pragma omp section { integer_radix_apply_univariate_lookup_table_kb( - &mem_ptr->sub_stream_1, &gpu_indexes[0], 1, remainder, remainder, bsk, - ksk, num_blocks, mem_ptr->message_extract_lut_1); + &mem_ptr->sub_stream_1, gpu_indexes, 1, remainder, remainder, bsks, + ksks, num_blocks, mem_ptr->message_extract_lut_1); } #pragma omp section { integer_radix_apply_univariate_lookup_table_kb( - &mem_ptr->sub_stream_2, &gpu_indexes[0], 1, quotient, quotient, bsk, - ksk, num_blocks, mem_ptr->message_extract_lut_2); + &mem_ptr->sub_stream_2, gpu_indexes, 1, quotient, quotient, bsks, + ksks, num_blocks, mem_ptr->message_extract_lut_2); } } cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 5ddfeffc9..d6fcc6b3d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -3,7 +3,7 @@ void cuda_full_propagation_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *input_blocks, int8_t *mem_ptr, void *ksk, void *bsk, + void *input_blocks, int8_t *mem_ptr, void **ksks, void **bsks, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level, uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_blocks) { @@ -13,57 +13,57 @@ void cuda_full_propagation_64_inplace( host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; case 512: host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; case 1024: host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; case 2048: host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, 
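The div/rem host code above fans independent sub-steps out onto dedicated sub-streams via OpenMP sections, now passing the shared gpu_indexes array with a gpu_count of 1, and re-synchronizing before the next dependent step. A stripped-down sketch of that shape using only the CUDA runtime (the backend's own cuda_synchronize_stream helper takes a (stream, gpu_index) pair instead):

#include <cuda_runtime.h>
#include <cstdint>
#include <omp.h>

// Sketch: run two independent pieces of work on two sub-streams of the same
// GPU, then join before the next dependent step.
template <typename F0, typename F1>
void fan_out_two(cudaStream_t sub_stream_1, cudaStream_t sub_stream_2,
                 uint32_t *gpu_indexes, F0 task_1, F1 task_2) {
#pragma omp parallel sections
  {
#pragma omp section
    { task_1(&sub_stream_1, gpu_indexes, /*gpu_count=*/1); }
#pragma omp section
    { task_2(&sub_stream_2, gpu_indexes, /*gpu_count=*/1); }
  }
  cudaSetDevice(static_cast<int>(gpu_indexes[0]));
  cudaStreamSynchronize(sub_stream_1);
  cudaStreamSynchronize(sub_stream_2);
}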
polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; case 4096: host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; case 8192: host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; case 16384: host_full_propagate_inplace>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(input_blocks), - (int_fullprop_buffer *)mem_ptr, static_cast(ksk), - bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, - ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks); + (int_fullprop_buffer *)mem_ptr, (uint64_t **)(ksks), bsks, + lwe_dimension, glwe_dimension, polynomial_size, ks_base_log, ks_level, + pbs_base_log, pbs_level, grouping_factor, num_blocks); break; default: PANIC("Cuda error (full propagation inplace): unsupported polynomial size. 
" @@ -116,12 +116,12 @@ void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index, } void scratch_cuda_propagate_single_carry_kb_64_inplace( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -129,38 +129,37 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( message_modulus, carry_modulus); scratch_cuda_propagate_single_carry_kb_inplace( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_sc_prop_memory **)mem_ptr, num_blocks, params, allocate_gpu_memory); } -void cuda_propagate_single_carry_kb_64_inplace(void **streams, - uint32_t *gpu_indexes, - uint32_t gpu_count, - void *lwe_array, void *carry_out, - int8_t *mem_ptr, void *bsk, - void *ksk, uint32_t num_blocks) { +void cuda_propagate_single_carry_kb_64_inplace( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, + void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks, + uint32_t num_blocks) { host_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(carry_out), - (int_sc_prop_memory *)mem_ptr, bsk, - static_cast(ksk), num_blocks); + (int_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), + num_blocks); } -void cleanup_cuda_propagate_single_carry(void *stream, uint32_t gpu_index, +void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_sc_prop_memory *mem_ptr = (int_sc_prop_memory *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } void scratch_cuda_apply_univariate_lut_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, void *input_lut, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, @@ -168,7 +167,7 @@ void scratch_cuda_apply_univariate_lut_kb_64( 
grouping_factor, message_modulus, carry_modulus); scratch_cuda_apply_univariate_lut_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_radix_lut **)mem_ptr, static_cast(input_lut), num_radix_blocks, params, allocate_gpu_memory); } @@ -176,19 +175,21 @@ void scratch_cuda_apply_univariate_lut_kb_64( void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, - void *ksk, void *bsk, + void **ksks, void **bsks, uint32_t num_blocks) { host_apply_univariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(output_radix_lwe), static_cast(input_radix_lwe), - (int_radix_lut *)mem_ptr, static_cast(ksk), bsk, + (int_radix_lut *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks); } -void cleanup_cuda_apply_univariate_lut_kb_64(void *stream, uint32_t gpu_index, +void cleanup_cuda_apply_univariate_lut_kb_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index f9178310f..e8c6fdca6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -138,7 +138,7 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams, template __host__ void integer_radix_apply_univariate_lookup_table_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, void *bsk, Torus *ksk, + Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks, uint32_t num_radix_blocks, int_radix_lut *lut) { // apply_lookup_table auto params = lut->params; @@ -153,30 +153,38 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( auto polynomial_size = params.polynomial_size; auto grouping_factor = params.grouping_factor; - // Compute Keyswitch-PBS - cuda_keyswitch_lwe_ciphertext_vector( - streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks, - lut->lwe_trivial_indexes, lwe_array_in, lut->lwe_indexes_in, ksk, - big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, - num_radix_blocks); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + /// Apply KS to go from a big LWE dimension to a small LWE dimension + execute_keyswitch(streams, gpu_indexes, gpu_count, + lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, + lwe_array_in, lut->lwe_indexes_in, ksks, + big_lwe_dimension, small_lwe_dimension, ks_base_log, + ks_level, num_radix_blocks, false); - execute_pbs(streams, gpu_indexes, gpu_count, lwe_array_out, - lut->lwe_indexes_out, lut->lut, lut->lut_indexes, - lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk, - lut->buffer, glwe_dimension, small_lwe_dimension, - polynomial_size, pbs_base_log, pbs_level, grouping_factor, - num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs( + streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out, + lut->lut, lut->lut_indexes, lut->tmp_lwe_after_ks, + lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension, + small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + 
grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false); + + /// Synchronize all GPUs + auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } template __host__ void integer_radix_apply_bivariate_lookup_table_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void *bsk, - Torus *ksk, uint32_t num_radix_blocks, int_radix_lut *lut, + Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks, + Torus **ksks, uint32_t num_radix_blocks, int_radix_lut *lut, uint32_t shift) { - cudaSetDevice(gpu_indexes[0]); - // apply_lookup_table_bivariate + auto params = lut->params; auto pbs_type = params.pbs_type; auto big_lwe_dimension = params.big_lwe_dimension; @@ -188,7 +196,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; auto grouping_factor = params.grouping_factor; - auto message_modulus = params.message_modulus; // Left message is shifted auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; @@ -198,20 +205,30 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( num_radix_blocks); check_cuda_error(cudaGetLastError()); - // Apply LUT - cuda_keyswitch_lwe_ciphertext_vector( - streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks, - lut->lwe_trivial_indexes, lwe_array_pbs_in, lut->lwe_trivial_indexes, ksk, - big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, - num_radix_blocks); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); - execute_pbs(streams, gpu_indexes, gpu_count, lwe_array_out, - lut->lwe_indexes_out, lut->lut, lut->lut_indexes, - lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk, - lut->buffer, glwe_dimension, small_lwe_dimension, - polynomial_size, pbs_base_log, pbs_level, grouping_factor, - num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + /// Apply KS to go from a big LWE dimension to a small LWE dimension + execute_keyswitch(streams, gpu_indexes, gpu_count, + lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, + lwe_array_pbs_in, lut->lwe_indexes_in, ksks, + big_lwe_dimension, small_lwe_dimension, ks_base_log, + ks_level, num_radix_blocks, false); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs( + streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out, + lut->lut, lut->lut_indexes, lut->tmp_lwe_after_ks, + lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension, + small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false); + + /// Synchronize all GPUs + auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } // Rotates the slice in-place such that the first mid elements of the slice move @@ -398,21 +415,21 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, template void scratch_cuda_propagate_single_carry_kb_inplace( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_sc_prop_memory **mem_ptr, uint32_t num_radix_blocks, int_radix_params 
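After the now-asynchronous keyswitch/PBS pair, the lookup-table helpers synchronize only the streams of GPUs that actually received work (via get_active_gpu_count). The helper below mirrors that loop with the plain CUDA runtime; get_active_gpu_count itself belongs to the backend and is not reimplemented here, so its result is taken as a parameter.

#include <cuda_runtime.h>
#include <cstdint>

// Sketch: wait on every stream that took part in the last multi-GPU call.
// active_gpu_count is whatever get_active_gpu_count(num_inputs, gpu_count)
// returned for that call; it can be smaller than the total GPU count.
inline void sync_active_streams(cudaStream_t *streams, uint32_t *gpu_indexes,
                                uint32_t active_gpu_count) {
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    cudaSetDevice(static_cast<int>(gpu_indexes[i]));
    cudaStreamSynchronize(streams[i]);
  }
}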
params, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); - *mem_ptr = new int_sc_prop_memory( - stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory); + *mem_ptr = + new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, + num_radix_blocks, allocate_gpu_memory); } template void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array, Torus *carry_out, - int_sc_prop_memory *mem, void *bsk, - Torus *ksk, uint32_t num_blocks) { + int_sc_prop_memory *mem, void **bsks, + Torus **ksks, uint32_t num_blocks) { auto params = mem->params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; @@ -427,14 +444,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, auto message_acc = mem->message_acc; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk, - ksk, num_blocks, luts_array); + streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks, + ksks, num_blocks, luts_array); // compute prefix sum with hillis&steele int num_steps = ceil(log2((double)num_blocks)); int space = 1; - cudaSetDevice(gpu_indexes[0]); cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates, big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]); @@ -446,17 +462,16 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks, - bsk, ksk, cur_total_blocks, luts_carry_propagation_sum, + bsks, ksks, cur_total_blocks, luts_carry_propagation_sum, luts_carry_propagation_sum->params.message_modulus); - cudaSetDevice(gpu_indexes[0]); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); cuda_memcpy_async_gpu_to_gpu( &generates_or_propagates[space * big_lwe_size], cur_blocks, big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]); space *= 2; } - cudaSetDevice(gpu_indexes[0]); host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output, generates_or_propagates, 1, num_blocks, big_lwe_size); @@ -471,7 +486,7 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, glwe_dimension * polynomial_size, num_blocks); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk, + streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, num_blocks, message_acc); } @@ -480,9 +495,8 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *overflowed, Torus *lwe_array, int_single_borrow_prop_memory *mem, - void *bsk, Torus *ksk, + void **bsks, Torus **ksks, uint32_t num_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem->params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; @@ -497,8 +511,8 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, auto message_acc = mem->message_acc; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk, - ksk, num_blocks, luts_array); + streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks, + ksks, num_blocks, luts_array); // compute prefix sum with hillis&steele int num_steps = ceil(log2((double)num_blocks)); @@ -514,7 +528,7 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, 
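The "prefix sum with hillis&steele" comment in host_propagate_single_carry refers to the usual Hillis-Steele scan: ceil(log2(n)) passes, with the stride ("space") doubling each pass and every element at index >= space combining with the element space positions back. For reference, the same loop shape on cleartext integers, with addition standing in for the bivariate LUT applied to ciphertext blocks:

#include <cmath>
#include <cstddef>
#include <vector>

// Inclusive Hillis-Steele scan over cleartext values; the GPU code follows
// the same schedule, but each "+" is a bivariate PBS over radix blocks.
inline void hillis_steele_scan(std::vector<long> &v) {
  const size_t n = v.size();
  if (n == 0)
    return;
  const int num_steps = static_cast<int>(std::ceil(std::log2((double)n)));
  size_t space = 1;
  for (int step = 0; step < num_steps; step++) {
    std::vector<long> prev = v; // read from the previous pass
    for (size_t i = space; i < n; i++)
      v[i] = prev[i - space] + prev[i];
    space *= 2;
  }
}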
integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks, - bsk, ksk, cur_total_blocks, luts_carry_propagation_sum, + bsks, ksks, cur_total_blocks, luts_carry_propagation_sum, luts_carry_propagation_sum->params.message_modulus); cuda_memcpy_async_gpu_to_gpu( @@ -537,7 +551,7 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, step_output, glwe_dimension * polynomial_size, num_blocks); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk, + streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, num_blocks, message_acc); } @@ -550,26 +564,24 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, * size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus) */ template -void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *input_blocks, - int_fullprop_buffer *mem_ptr, - Torus *ksk, void *bsk, uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t ks_base_log, - uint32_t ks_level, uint32_t pbs_base_log, - uint32_t pbs_level, uint32_t grouping_factor, - uint32_t num_blocks) { +void host_full_propagate_inplace( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *input_blocks, int_fullprop_buffer *mem_ptr, Torus **ksks, + void **bsks, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level, + uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor, + uint32_t num_blocks) { - cudaSetDevice(gpu_indexes[0]); int big_lwe_size = (glwe_dimension * polynomial_size + 1); int small_lwe_size = (lwe_dimension + 1); + cudaSetDevice(gpu_indexes[0]); for (int i = 0; i < num_blocks; i++) { auto cur_input_block = &input_blocks[i * big_lwe_size]; cuda_keyswitch_lwe_ciphertext_vector( streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, - mem_ptr->lwe_indexes, cur_input_block, mem_ptr->lwe_indexes, ksk, + mem_ptr->lwe_indexes, cur_input_block, mem_ptr->lwe_indexes, ksks[0], polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level, 1); @@ -578,13 +590,16 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, small_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); + std::vector pbs_buffer_vec; + pbs_buffer_vec.push_back(mem_ptr->pbs_buffer); execute_pbs( streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes, mem_ptr->lut_buffer, mem_ptr->lut_indexes, - mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk, - mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, + mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsks, + pbs_buffer_vec, glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level, grouping_factor, 2, 2, 0, cuda_get_max_shared_memory(gpu_indexes[0]), mem_ptr->pbs_type); + pbs_buffer_vec.clear(); cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector, big_lwe_size * sizeof(Torus), streams[0], @@ -805,22 +820,22 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, template __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_in, void *bsk, Torus *ksk, + Torus *lwe_array_in, void **bsks, Torus **ksks, uint32_t num_radix_blocks, uint32_t bits_per_block, int_bit_extract_luts_buffer *bit_extract) { integer_radix_apply_univariate_lookup_table_kb( - 
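host_full_propagate_inplace above still drives a single GPU, so to reuse the now vector-based execute_pbs buffer argument it wraps its lone pbs_buffer pointer in a temporary vector. The idea in isolation; the element type is assumed to be int8_t * here, since the template argument is not visible in this diff, and execute_pbs_like is only a stand-in:

#include <cstdint>
#include <vector>

// Hypothetical multi-buffer interface: one scratch buffer per active GPU.
void execute_pbs_like(const std::vector<int8_t *> &pbs_buffers /*, ... */) {
  // ... would launch one PBS batch per buffer/GPU ...
}

void single_gpu_call(int8_t *pbs_buffer) {
  std::vector<int8_t *> pbs_buffer_vec;
  pbs_buffer_vec.push_back(pbs_buffer); // single-GPU path: exactly one entry
  execute_pbs_like(pbs_buffer_vec);
  pbs_buffer_vec.clear(); // mirrors the explicit clear() in the diff
}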
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk, + streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks, num_radix_blocks * bits_per_block, bit_extract->lut); } template -__host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *signs_array_out, - Torus *signs_array_in, - int_comparison_buffer *mem_ptr, - std::function sign_handler_f, - void *bsk, Torus *ksk, uint32_t num_sign_blocks) { +__host__ void +reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *signs_array_out, Torus *signs_array_in, + int_comparison_buffer *mem_ptr, + std::function sign_handler_f, void **bsks, + Torus **ksks, uint32_t num_sign_blocks) { cudaSetDevice(gpu_indexes[0]); auto diff_buffer = mem_ptr->diff_buffer; @@ -857,7 +872,7 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension, num_sign_blocks, 4); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, signs_a, signs_b, bsk, ksk, + streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks, num_sign_blocks / 2, lut); auto last_block_signs_b = @@ -889,7 +904,7 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, 2, 4); integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, gpu_count, signs_array_out, - signs_b, bsk, ksk, 1, lut); + signs_b, bsks, ksks, 1, lut); } else { @@ -905,34 +920,35 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, gpu_count, signs_array_out, - signs_a, bsk, ksk, 1, lut); + signs_a, bsks, ksks, 1, lut); } } template void scratch_cuda_apply_univariate_lut_kb( - cudaStream_t stream, uint32_t gpu_index, int_radix_lut **mem_ptr, - Torus *input_lut, uint32_t num_radix_blocks, int_radix_params params, - bool allocate_gpu_memory) { + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_lut **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks, + int_radix_params params, bool allocate_gpu_memory) { - *mem_ptr = new int_radix_lut(stream, gpu_index, params, 1, - num_radix_blocks, allocate_gpu_memory); + *mem_ptr = new int_radix_lut(streams, gpu_indexes, gpu_count, params, + 1, num_radix_blocks, allocate_gpu_memory); + // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU + // 0 cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut, (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus), - stream, gpu_index); + streams[0], gpu_indexes[0]); } template void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *radix_lwe_out, Torus *radix_lwe_in, - int_radix_lut *mem, Torus *ksk, - void *bsk, uint32_t num_blocks) { + int_radix_lut *mem, Torus **ksks, + void **bsks, uint32_t num_blocks) { - cudaSetDevice(gpu_indexes[0]); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsk, ksk, + streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks, num_blocks, mem); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu index 965b42942..f19971047 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu @@ -66,7 +66,7 @@ void 
generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, * the integer radix multiplication in keyswitch->bootstrap order. */ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level, @@ -87,7 +87,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( case 8192: case 16384: scratch_cuda_integer_mult_radix_ciphertext_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_mul_memory **)mem_ptr, num_radix_blocks, params, allocate_gpu_memory); break; @@ -127,8 +127,9 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( */ void cuda_integer_mult_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, void *bsk, - void *ksk, int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) { + void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, + void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size, + uint32_t num_blocks) { switch (polynomial_size) { case 256: @@ -136,63 +137,56 @@ void cuda_integer_mult_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; case 512: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; case 1024: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; case 2048: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; case 4096: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; case 8192: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; case 
16384: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), (int_mul_memory *)mem_ptr, - num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + (int_mul_memory *)mem_ptr, num_blocks); break; default: PANIC("Cuda error (integer multiplication): unsupported polynomial size. " @@ -200,29 +194,30 @@ void cuda_integer_mult_radix_ciphertext_kb_64( } } -void cleanup_cuda_integer_mult(void *stream, uint32_t gpu_index, - int8_t **mem_ptr_void) { +void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_mul_memory *mem_ptr = (int_mul_memory *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks_in_radix, - uint32_t max_num_radix_in_vec, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, + uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, + uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log, grouping_factor, message_modulus, carry_modulus); scratch_cuda_integer_sum_ciphertexts_vec_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_sum_ciphertexts_vec_memory **)mem_ptr, num_blocks_in_radix, max_num_radix_in_vec, params, allocate_gpu_memory); } @@ -230,7 +225,7 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64( void cuda_integer_radix_sum_ciphertexts_vec_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks_in_radix) { + int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) { auto mem = (int_sum_ciphertexts_vec_memory *)mem_ptr; @@ -246,49 +241,43 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64( host_integer_sum_ciphertexts_vec_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsk, - static_cast(ksk), mem, num_blocks_in_radix, - num_radix_in_vec); + static_cast(radix_lwe_vec), terms_degree, bsks, + (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec); break; case 1024: host_integer_sum_ciphertexts_vec_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsk, - static_cast(ksk), mem, num_blocks_in_radix, - num_radix_in_vec); + static_cast(radix_lwe_vec), terms_degree, bsks, + (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec); break; case 2048: 
host_integer_sum_ciphertexts_vec_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsk, - static_cast(ksk), mem, num_blocks_in_radix, - num_radix_in_vec); + static_cast(radix_lwe_vec), terms_degree, bsks, + (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec); break; case 4096: host_integer_sum_ciphertexts_vec_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsk, - static_cast(ksk), mem, num_blocks_in_radix, - num_radix_in_vec); + static_cast(radix_lwe_vec), terms_degree, bsks, + (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec); break; case 8192: host_integer_sum_ciphertexts_vec_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsk, - static_cast(ksk), mem, num_blocks_in_radix, - num_radix_in_vec); + static_cast(radix_lwe_vec), terms_degree, bsks, + (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec); break; case 16384: host_integer_sum_ciphertexts_vec_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_vec), terms_degree, bsk, - static_cast(ksk), mem, num_blocks_in_radix, - num_radix_in_vec); + static_cast(radix_lwe_vec), terms_degree, bsks, + (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec); break; default: PANIC("Cuda error (integer multiplication): unsupported polynomial size. " @@ -298,11 +287,12 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64( free(terms_degree); } -void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_sum_ciphertexts_vec_memory *mem_ptr = (int_sum_ciphertexts_vec_memory *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 027b61c89..a860d60fa 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -181,14 +181,13 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, } template __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_sum_ciphertexts_vec_memory **mem_ptr, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, int_radix_params params, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus); - if (sm_size < cuda_get_max_shared_memory(gpu_index)) { + if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) { check_cuda_error(cudaFuncSetAttribute( tree_add_chunks, cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size)); @@ -203,18 +202,17 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb( check_cuda_error(cudaGetLastError()); } *mem_ptr = new int_sum_ciphertexts_vec_memory( - stream, gpu_index, params, num_blocks_in_radix, max_num_radix_in_vec, - allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, num_blocks_in_radix, + max_num_radix_in_vec, allocate_gpu_memory); } template 
__host__ void host_integer_sum_ciphertexts_vec_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *radix_lwe_out, Torus *terms, int *terms_degree, void *bsk, - uint64_t *ksk, int_sum_ciphertexts_vec_memory *mem_ptr, + Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks, + uint64_t **ksks, int_sum_ciphertexts_vec_memory *mem_ptr, uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) { - cudaSetDevice(gpu_indexes[0]); auto new_blocks = mem_ptr->new_blocks; auto old_blocks = mem_ptr->old_blocks; auto small_lwe_vector = mem_ptr->small_lwe_vector; @@ -258,6 +256,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( dim3 add_grid(ch_amount, num_blocks, 1); size_t sm_size = big_lwe_size * sizeof(Torus); + cudaSetDevice(gpu_indexes[0]); if (sm_size < max_shared_memory) tree_add_chunks<<>>( new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks); @@ -281,7 +280,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( // we allocate luts_message_carry in the host function (instead of scratch) // to reduce average memory consumption auto luts_message_carry = new int_radix_lut( - streams[0], gpu_indexes[0], mem_ptr->params, 2, total_count, true); + streams, gpu_indexes, gpu_count, mem_ptr->params, 2, total_count, true); auto message_acc = luts_message_carry->get_lut(0); auto carry_acc = luts_message_carry->get_lut(1); @@ -326,22 +325,35 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( streams[0], gpu_indexes[0], luts_message_carry->get_lut_indexes(message_count), 1, carry_count); - cuda_keyswitch_lwe_ciphertext_vector( - streams[0], gpu_indexes[0], small_lwe_vector, lwe_indexes_in, - new_blocks, lwe_indexes_in, ksk, polynomial_size * glwe_dimension, - lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, - message_count); + auto active_gpu_count = get_active_gpu_count(total_count, gpu_count); + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + /// Apply KS to go from a big LWE dimension to a small LWE dimension + /// After this keyswitch execution, we need to synchronize the streams + /// because the keyswitch and PBS do not operate on the same number of + /// inputs + execute_keyswitch(streams, gpu_indexes, gpu_count, small_lwe_vector, + lwe_indexes_in, new_blocks, lwe_indexes_in, ksks, + polynomial_size * glwe_dimension, lwe_dimension, + mem_ptr->params.ks_base_log, + mem_ptr->params.ks_level, message_count, true); + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension execute_pbs(streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out, luts_message_carry->lut, luts_message_carry->lut_indexes, small_lwe_vector, - lwe_indexes_in, bsk, luts_message_carry->buffer, + lwe_indexes_in, bsks, luts_message_carry->buffer, glwe_dimension, lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count, 2, 0, - max_shared_memory, mem_ptr->params.pbs_type); - - luts_message_carry->release(streams[0], gpu_indexes[0]); + max_shared_memory, mem_ptr->params.pbs_type, false); + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + luts_message_carry->release(streams, gpu_indexes, gpu_count); int rem_blocks = (r > chunk_size) ? 
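The sum-of-ciphertexts path above synchronizes and releases per active GPU because, when there are fewer inputs than GPUs, only some GPUs receive work. One plausible way such an "active GPU count" could be derived is sketched below; the backend's actual get_active_gpu_count may well be defined differently.

#include <cstdint>

// Hypothetical: with inputs spread as evenly as possible, a GPU is active
// only if at least one input lands on it.
inline uint32_t active_gpu_count_guess(uint32_t num_inputs,
                                       uint32_t gpu_count) {
  if (num_inputs == 0)
    return 0;
  return num_inputs < gpu_count ? num_inputs : gpu_count;
}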
r % chunk_size * num_blocks : 0; int new_blocks_created = 2 * ch_amount * num_blocks; @@ -361,17 +373,16 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( host_propagate_single_carry(streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, mem_ptr->scp_mem, - bsk, ksk, num_blocks); + bsks, ksks, num_blocks); } template __host__ void host_integer_mult_radix_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left, - uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk, + uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks, int_mul_memory *mem_ptr, uint32_t num_blocks) { - cudaSetDevice(gpu_indexes[0]); auto glwe_dimension = mem_ptr->params.glwe_dimension; auto polynomial_size = mem_ptr->params.polynomial_size; auto lwe_dimension = mem_ptr->params.small_lwe_dimension; @@ -438,6 +449,7 @@ __host__ void host_integer_mult_radix_kb( dim3 grid(lsb_vector_block_count, 1, 1); dim3 thds(params::degree / params::opt, 1, 1); + cudaSetDevice(gpu_indexes[0]); all_shifted_lhs_rhs<<>>( radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right, vector_lsb_rhs, vector_msb_rhs, num_blocks); @@ -445,13 +457,14 @@ __host__ void host_integer_mult_radix_kb( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res, - vector_result_sb, bsk, ksk, total_block_count, luts_array, + vector_result_sb, bsks, ksks, total_block_count, luts_array, luts_array->params.message_modulus); vector_result_lsb = &block_mul_res[0]; vector_result_msb = &block_mul_res[lsb_vector_block_count * (polynomial_size * glwe_dimension + 1)]; + cudaSetDevice(gpu_indexes[0]); fill_radix_from_lsb_msb <<>>(vector_result_sb, vector_result_lsb, vector_result_msb, @@ -474,18 +487,17 @@ __host__ void host_integer_mult_radix_kb( host_integer_sum_ciphertexts_vec_kb( streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, - terms_degree, bsk, ksk, mem_ptr->sum_ciphertexts_mem, num_blocks, + terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks); } template __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb( - cudaStream_t stream, uint32_t gpu_index, int_mul_memory **mem_ptr, - uint32_t num_radix_blocks, int_radix_params params, - bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_mul_memory **mem_ptr, uint32_t num_radix_blocks, + int_radix_params params, bool allocate_gpu_memory) { size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus); - if (sm_size < cuda_get_max_shared_memory(gpu_index)) { + if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) { check_cuda_error(cudaFuncSetAttribute( tree_add_chunks, cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size)); @@ -500,7 +512,7 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb( check_cuda_error(cudaGetLastError()); } - *mem_ptr = new int_mul_memory(stream, gpu_index, params, + *mem_ptr = new int_mul_memory(streams, gpu_indexes, gpu_count, params, num_radix_blocks, allocate_gpu_memory); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index 7e2032fb6..3f05a78f2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -12,12 +12,12 @@ void cuda_negate_integer_radix_ciphertext_64_inplace( } void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void *stream, uint32_t 
gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -25,7 +25,7 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_overflowing_sub_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_overflowing_sub_memory **)mem_ptr, num_blocks, params, allocate_gpu_memory); } @@ -33,7 +33,7 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64( void cuda_integer_radix_overflowing_sub_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left, - void *radix_lwe_right, int8_t *mem_ptr, void *bsk, void *ksk, + void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks) { auto mem = (int_overflowing_sub_memory *)mem_ptr; @@ -45,8 +45,8 @@ void cuda_integer_radix_overflowing_sub_kb_64( static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), mem, num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); break; case 1024: host_integer_overflowing_sub_kb>( @@ -54,8 +54,8 @@ void cuda_integer_radix_overflowing_sub_kb_64( static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), mem, num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); break; case 2048: host_integer_overflowing_sub_kb>( @@ -63,8 +63,8 @@ void cuda_integer_radix_overflowing_sub_kb_64( static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), mem, num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); break; case 4096: host_integer_overflowing_sub_kb>( @@ -72,8 +72,8 @@ void cuda_integer_radix_overflowing_sub_kb_64( static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), mem, num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); break; case 8192: host_integer_overflowing_sub_kb>( @@ -81,8 +81,8 @@ void cuda_integer_radix_overflowing_sub_kb_64( static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), mem, num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); break; case 16384: host_integer_overflowing_sub_kb>( @@ -90,8 +90,8 @@ void 
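Every scratch_* and cleanup_* entry point in this patch trades its single (stream, gpu_index) pair for parallel arrays plus a count, as in the overflowing-sub functions above. A self-contained sketch of what a caller has to prepare, assuming only the CUDA runtime (build_gpu_set is hypothetical; in the backend these arrays are built on the Rust side before crossing the C API as void **):

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

// Hypothetical caller-side setup: one stream per visible GPU plus the matching
// device index, in the layout the new scratch_*/cleanup_* signatures expect.
void build_gpu_set(std::vector<cudaStream_t> &streams,
                   std::vector<uint32_t> &gpu_indexes) {
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  streams.resize(device_count);
  gpu_indexes.resize(device_count);
  for (int i = 0; i < device_count; i++) {
    gpu_indexes[i] = static_cast<uint32_t>(i);
    cudaSetDevice(i);                // each stream is created on its own device
    cudaStreamCreate(&streams[i]);
  }
}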
cuda_integer_radix_overflowing_sub_kb_64( static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsk, - static_cast(ksk), mem, num_blocks); + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); break; default: PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. " @@ -99,11 +99,12 @@ void cuda_integer_radix_overflowing_sub_kb_64( } } -void cleanup_cuda_integer_radix_overflowing_sub(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_overflowing_sub(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { int_overflowing_sub_memory *mem_ptr = (int_overflowing_sub_memory *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index ac49c4f92..5e12ad55c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -90,20 +90,19 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void scratch_cuda_integer_overflowing_sub_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_overflowing_sub_memory **mem_ptr, uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); *mem_ptr = new int_overflowing_sub_memory( - stream, gpu_index, params, num_blocks, allocate_gpu_memory); + streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } template __host__ void host_integer_overflowing_sub_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left, - Torus *radix_lwe_right, void *bsk, uint64_t *ksk, + Torus *radix_lwe_right, void **bsks, uint64_t **ksks, int_overflowing_sub_memory *mem_ptr, uint32_t num_blocks) { auto radix_params = mem_ptr->params; @@ -116,7 +115,7 @@ __host__ void host_integer_overflowing_sub_kb( host_propagate_single_sub_borrow( streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out, - mem_ptr->borrow_prop_mem, bsk, ksk, num_blocks); + mem_ptr->borrow_prop_mem, bsks, ksks, num_blocks); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu index b2eac090e..2a94cc99b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu @@ -3,7 +3,7 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_input, void *clear_blocks, - uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk, void *ksk, + uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) { host_integer_radix_scalar_bitop_kb( @@ -11,6 +11,6 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64( static_cast(lwe_array_out), static_cast(lwe_array_input), static_cast(clear_blocks), num_clear_blocks, - (int_bitop_buffer *)mem_ptr, bsk, static_cast(ksk), + (int_bitop_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count, op); } diff --git 
a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh index e1670c7fc..ffaa021a8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh @@ -8,10 +8,9 @@ template __host__ void host_integer_radix_scalar_bitop_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks, - uint32_t num_clear_blocks, int_bitop_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t num_radix_blocks, BITOP_TYPE op) { + uint32_t num_clear_blocks, int_bitop_buffer *mem_ptr, void **bsks, + Torus **ksks, uint32_t num_radix_blocks, BITOP_TYPE op) { - cudaSetDevice(gpu_indexes[0]); auto lut = mem_ptr->lut; auto params = lut->params; auto big_lwe_dimension = params.big_lwe_dimension; @@ -36,8 +35,8 @@ __host__ void host_integer_radix_scalar_bitop_kb( gpu_indexes[0]); integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsk, - ksk, num_clear_blocks, lut); + streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks, + ksks, num_clear_blocks, lut); if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) { auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu index 1223d5b2f..9334be96e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu @@ -3,7 +3,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out, void *lwe_array_in, void *scalar_blocks, - int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count, + int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) { int_comparison_buffer *buffer = @@ -15,8 +15,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), static_cast(lwe_array_in), - static_cast(scalar_blocks), buffer, bsk, - static_cast(ksk), lwe_ciphertext_count, num_scalar_blocks); + static_cast(scalar_blocks), buffer, bsks, + (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks); break; case GT: case GE: @@ -27,7 +27,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( static_cast(lwe_array_out), static_cast(lwe_array_in), static_cast(scalar_blocks), buffer, - buffer->diff_buffer->operator_f, bsk, static_cast(ksk), + buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks); break; case MAX: @@ -36,8 +36,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), static_cast(lwe_array_in), - static_cast(scalar_blocks), buffer, bsk, - static_cast(ksk), lwe_ciphertext_count, num_scalar_blocks); + static_cast(scalar_blocks), buffer, bsks, + (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks); break; default: PANIC("Cuda error: integer operation not supported") diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 87905c9a2..15272a94a 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -9,10 +9,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, int_comparison_buffer *mem_ptr, - std::function sign_handler_f, void *bsk, Torus *ksk, + std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -47,10 +46,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( if (total_num_scalar_blocks == 0) { // We only have to compare blocks with zero // means scalar is zero - host_compare_with_zero_equality(streams, gpu_indexes, gpu_count, - mem_ptr->tmp_lwe_array_out, lwe_array_in, - mem_ptr, bsk, ksk, total_num_radix_blocks, - mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + streams, gpu_indexes, 1, mem_ptr->tmp_lwe_array_out, lwe_array_in, + mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut); auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus { x = (x == 1 ? IS_EQUAL : IS_SUPERIOR); @@ -65,7 +63,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, - mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut); + mem_ptr->tmp_lwe_array_out, bsks, ksks, 1, lut); } else if (total_num_scalar_blocks < total_num_radix_blocks) { // We have to handle both part of the work described above @@ -109,15 +107,15 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( auto comparisons = mem_ptr->tmp_block_comparisons; scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1, - comparisons, lhs, rhs, mem_ptr, bsk, ksk, - num_lsb_radix_blocks); + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out, comparisons, mem_ptr->diff_buffer->tree_buffer, - mem_ptr->identity_lut_f, bsk, ksk, + mem_ptr->identity_lut_f, bsks, ksks, num_lsb_radix_blocks); } #pragma omp section @@ -126,7 +124,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // msb host_compare_with_zero_equality( &msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, msb, mem_ptr, - bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut); + bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); } } cuda_synchronize_stream(lsb_stream, gpu_indexes[0]); @@ -150,7 +148,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, - lwe_array_msb_out, bsk, ksk, 1, lut, lut->params.message_modulus); + lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus); } else { // We only have to do the regular comparison @@ -177,7 +175,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_lwe_array_out; scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, - lhs, rhs, mem_ptr, bsk, ksk, + lhs, rhs, mem_ptr, bsks, 
ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign @@ -185,7 +183,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // final sign tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer, - sign_handler_f, bsk, ksk, num_lsb_radix_blocks); + sign_handler_f, bsks, ksks, num_lsb_radix_blocks); } } @@ -194,7 +192,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, int_comparison_buffer *mem_ptr, - std::function sign_handler_f, void *bsk, Torus *ksk, + std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { cudaSetDevice(gpu_indexes[0]); @@ -234,8 +232,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // means scalar is zero Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out; host_compare_with_zero_equality( - streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in, - mem_ptr, bsk, ksk, total_num_radix_blocks, mem_ptr->is_zero_lut); + streams, gpu_indexes, 1, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, + ksks, total_num_radix_blocks, mem_ptr->is_zero_lut); Torus *sign_block = lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size; @@ -281,7 +279,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros, - sign_block, bsk, ksk, 1, lut, lut->params.message_modulus); + sign_block, bsks, ksks, 1, lut, lut->params.message_modulus); } else if (total_num_scalar_blocks < total_num_radix_blocks) { // We have to handle both part of the work described above @@ -325,15 +323,15 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( auto comparisons = mem_ptr->tmp_block_comparisons; scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1, - comparisons, lhs, rhs, mem_ptr, bsk, ksk, - num_lsb_radix_blocks); + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out, comparisons, mem_ptr->diff_buffer->tree_buffer, - mem_ptr->identity_lut_f, bsk, ksk, + mem_ptr->identity_lut_f, bsks, ksks, num_lsb_radix_blocks); } #pragma omp section @@ -344,7 +342,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( Torus *are_all_msb_zeros = lwe_array_msb_out; host_compare_with_zero_equality( &msb_stream, &gpu_indexes[0], 1, are_all_msb_zeros, msb, mem_ptr, - bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut); + bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); auto sign_bit_pos = (int)log2(message_modulus) - 1; @@ -379,7 +377,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size; integer_radix_apply_bivariate_lookup_table_kb( &msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, sign_block, - are_all_msb_zeros, bsk, ksk, 1, signed_msb_lut, + are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut, signed_msb_lut->params.message_modulus); } } @@ -389,7 +387,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( ////////////// // Reduce the two blocks into one final reduce_signs(streams, 
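In the comparison hunks above, the LSB and MSB halves run on two local streams inside #pragma omp sections, each sub-call pinned to the first GPU (a gpu_count of 1), and both streams are drained before the final reduction. A minimal stand-alone version of that fork-join shape, with placeholder kernels instead of the backend's LUT calls (lsb_branch, msb_branch and fork_join_single_gpu are made-up names):

#include <cuda_runtime.h>
#include <cstdint>
#include <omp.h>

__global__ void lsb_branch(uint64_t *out) { out[0] += 1; } // stand-in for the LSB compare
__global__ void msb_branch(uint64_t *out) { out[1] += 1; } // stand-in for the MSB compare

void fork_join_single_gpu(cudaStream_t lsb_stream, cudaStream_t msb_stream,
                          uint32_t gpu_index, uint64_t *d_out) {
#pragma omp parallel sections num_threads(2)
  {
#pragma omp section
    {
      cudaSetDevice(gpu_index); // each OpenMP worker has its own current device
      lsb_branch<<<1, 1, 0, lsb_stream>>>(d_out);
    }
#pragma omp section
    {
      cudaSetDevice(gpu_index);
      msb_branch<<<1, 1, 0, msb_stream>>>(d_out);
    }
  }
  // Join: the code that follows (the final bivariate LUT in the patch) reads
  // results from both branches, so both local streams must be empty first.
  cudaStreamSynchronize(lsb_stream);
  cudaStreamSynchronize(msb_stream);
}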
gpu_indexes, gpu_count, lwe_array_out, - lwe_array_lsb_out, mem_ptr, sign_handler_f, bsk, ksk, 2); + lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2); } else { // We only have to do the regular comparison @@ -426,8 +424,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // - 1 if lhs == rhs // - 2 if lhs > rhs scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1, - lwe_array_ct_out, lhs, rhs, mem_ptr, bsk, - ksk, num_lsb_radix_blocks); + lwe_array_ct_out, lhs, rhs, mem_ptr, + bsks, ksks, num_lsb_radix_blocks); } #pragma omp section { @@ -443,7 +441,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( integer_radix_apply_bivariate_lookup_table_kb( &msb_stream, &gpu_indexes[0], 1, lwe_array_sign_out, - encrypted_sign_block, trivial_sign_block, bsk, ksk, 1, + encrypted_sign_block, trivial_sign_block, bsks, ksks, 1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); } } @@ -454,7 +452,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // (inferior, equal, superior) to one single radix block containing the // final sign reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, - lwe_array_ct_out, mem_ptr, sign_handler_f, bsk, ksk, + lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks, num_lsb_radix_blocks + 1); } } @@ -463,7 +461,7 @@ template __host__ void integer_radix_signed_scalar_maxmin_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { cudaSetDevice(gpu_indexes[0]); @@ -475,7 +473,7 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( auto sign = mem_ptr->tmp_lwe_array_out; integer_radix_signed_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks, - mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks, + mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); // There is no optimized CMUX for scalars, so we convert to a trivial @@ -490,9 +488,10 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks); + host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, + sign, lwe_array_left, lwe_array_right, + mem_ptr->cmux_buffer, bsks, ksks, + total_num_radix_blocks); } template @@ -500,19 +499,19 @@ __host__ void host_integer_radix_scalar_difference_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, int_comparison_buffer *mem_ptr, - std::function sign_handler_f, void *bsk, Torus *ksk, + std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { if (mem_ptr->is_signed) { // is signed and scalar is positive integer_radix_signed_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, - scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk, + scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } else { integer_radix_unsigned_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, 
lwe_array_out, lwe_array_in, - scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk, + scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } } @@ -521,32 +520,30 @@ template __host__ void host_integer_radix_signed_scalar_maxmin_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { if (mem_ptr->is_signed) { // is signed and scalar is positive integer_radix_signed_scalar_maxmin_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, - scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks, + scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } else { integer_radix_unsigned_scalar_maxmin_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, - scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks, + scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } } template -__host__ void -scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void *bsk, - Torus *ksk, uint32_t num_radix_blocks) { +__host__ void scalar_compare_radix_blocks_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, + uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; @@ -579,8 +576,8 @@ scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Apply LUT to compare to 0 auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsk, - ksk, num_radix_blocks, sign_lut); + streams, gpu_indexes, 1, lwe_array_out, subtracted_blocks, bsks, ksks, + num_radix_blocks, sign_lut); // Add one // Here Lhs can have the following values: (-1) % (message modulus * carry @@ -594,10 +591,9 @@ template __host__ void host_integer_radix_scalar_maxmin_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; // Calculates the difference sign between the ciphertext and the scalar @@ -607,7 +603,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb( auto sign = mem_ptr->tmp_lwe_array_out; host_integer_radix_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks, - mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks, + mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); // There is no optimized CMUX for scalars, so we convert to a trivial @@ -624,7 +620,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb( // CMUX for Max or Min host_integer_radix_cmux_kb(streams, gpu_indexes, 
gpu_count, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, + lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); } @@ -632,10 +628,9 @@ template __host__ void host_integer_radix_scalar_equality_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void *bsk, Torus *ksk, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; @@ -689,7 +684,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb( integer_radix_apply_univariate_lookup_table_kb( &lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out, packed_blocks, - bsk, ksk, num_halved_lsb_radix_blocks, scalar_comparison_luts); + bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts); } } #pragma omp section @@ -710,8 +705,8 @@ __host__ void host_integer_radix_scalar_equality_check_kb( } host_compare_with_zero_equality(&msb_stream, &gpu_indexes[0], 1, - lwe_array_msb_out, msb, mem_ptr, bsk, - ksk, num_msb_radix_blocks, msb_lut); + lwe_array_msb_out, msb, mem_ptr, bsks, + ksks, num_msb_radix_blocks, msb_lut); } } } @@ -723,13 +718,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb( case COMPARISON_TYPE::EQ: are_all_comparisons_block_true( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, - mem_ptr, bsk, ksk, + mem_ptr, bsks, ksks, num_halved_scalar_blocks + (num_msb_radix_blocks > 0)); break; case COMPARISON_TYPE::NE: is_at_least_one_comparisons_block_true( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, - mem_ptr, bsk, ksk, + mem_ptr, bsks, ksks, num_halved_scalar_blocks + (num_msb_radix_blocks > 0)); break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu index c71ad0b21..c17d58519 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu @@ -1,11 +1,12 @@ #include "integer/scalar_mul.cuh" void scratch_cuda_integer_scalar_mul_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, + uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, + uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, @@ -13,7 +14,7 @@ void scratch_cuda_integer_scalar_mul_kb_64( grouping_factor, message_modulus, carry_modulus); scratch_cuda_integer_radix_scalar_mul_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_scalar_mul_buffer **)mem_ptr, num_blocks, params, 
allocate_gpu_memory); } @@ -21,7 +22,7 @@ void scratch_cuda_integer_scalar_mul_kb_64( void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem, - void *bsk, void *ksk, uint32_t lwe_dimension, uint32_t polynomial_size, + void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) { switch (polynomial_size) { @@ -30,54 +31,54 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), decomposed_scalar, has_at_least_one_set, - reinterpret_cast *>(mem), bsk, - static_cast(ksk), lwe_dimension, message_modulus, - num_blocks, num_scalars); + reinterpret_cast *>(mem), bsks, + (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, + num_scalars); break; case 1024: host_integer_scalar_mul_radix>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), decomposed_scalar, has_at_least_one_set, - reinterpret_cast *>(mem), bsk, - static_cast(ksk), lwe_dimension, message_modulus, - num_blocks, num_scalars); + reinterpret_cast *>(mem), bsks, + (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, + num_scalars); break; case 2048: host_integer_scalar_mul_radix>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), decomposed_scalar, has_at_least_one_set, - reinterpret_cast *>(mem), bsk, - static_cast(ksk), lwe_dimension, message_modulus, - num_blocks, num_scalars); + reinterpret_cast *>(mem), bsks, + (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, + num_scalars); break; case 4096: host_integer_scalar_mul_radix>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), decomposed_scalar, has_at_least_one_set, - reinterpret_cast *>(mem), bsk, - static_cast(ksk), lwe_dimension, message_modulus, - num_blocks, num_scalars); + reinterpret_cast *>(mem), bsks, + (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, + num_scalars); break; case 8192: host_integer_scalar_mul_radix>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), decomposed_scalar, has_at_least_one_set, - reinterpret_cast *>(mem), bsk, - static_cast(ksk), lwe_dimension, message_modulus, - num_blocks, num_scalars); + reinterpret_cast *>(mem), bsks, + (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, + num_scalars); break; case 16384: host_integer_scalar_mul_radix>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), decomposed_scalar, has_at_least_one_set, - reinterpret_cast *>(mem), bsk, - static_cast(ksk), lwe_dimension, message_modulus, - num_blocks, num_scalars); + reinterpret_cast *>(mem), bsks, + (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks, + num_scalars); break; default: PANIC("Cuda error (scalar multiplication): unsupported polynomial size. 
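Throughout the scalar-multiplication wrappers above, the single void *bsk / void *ksk arguments become arrays of per-GPU device pointers (void **bsks, void **ksks) indexed like the streams. One plausible way such an array is populated is to copy the same host-side key onto every device; the actual key upload lives elsewhere in the backend, so broadcast_ksk below is only a sketch under that assumption:

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

// Hypothetical broadcast of one keyswitch key to every GPU; the returned
// vector is indexed exactly like the new ksks parameter of the C API.
std::vector<uint64_t *> broadcast_ksk(const uint64_t *h_ksk, size_t ksk_len,
                                      const uint32_t *gpu_indexes,
                                      uint32_t gpu_count,
                                      cudaStream_t *streams) {
  std::vector<uint64_t *> ksks(gpu_count, nullptr);
  for (uint32_t i = 0; i < gpu_count; i++) {
    cudaSetDevice(gpu_indexes[i]);
    cudaMalloc(&ksks[i], ksk_len * sizeof(uint64_t));
    cudaMemcpyAsync(ksks[i], h_ksk, ksk_len * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, streams[i]);
  }
  return ksks;
}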
" @@ -85,12 +86,13 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( } } -void cleanup_cuda_integer_radix_scalar_mul(void *stream, uint32_t gpu_index, +void cleanup_cuda_integer_radix_scalar_mul(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { - cudaSetDevice(gpu_index); int_scalar_mul_buffer *mem_ptr = (int_scalar_mul_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index ce0cec85b..b0f970889 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -29,13 +29,12 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array, template __host__ void scratch_cuda_integer_radix_scalar_mul_kb( - cudaStream_t stream, uint32_t gpu_index, int_scalar_mul_buffer **mem_ptr, - uint32_t num_radix_blocks, int_radix_params params, - bool allocate_gpu_memory) { + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_scalar_mul_buffer **mem_ptr, uint32_t num_radix_blocks, + int_radix_params params, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T); - if (sm_size < cuda_get_max_shared_memory(gpu_index)) { + if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) { check_cuda_error(cudaFuncSetAttribute( tree_add_chunks, cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size)); @@ -50,22 +49,22 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb( check_cuda_error(cudaGetLastError()); } - *mem_ptr = new int_scalar_mul_buffer( - stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory); + *mem_ptr = + new int_scalar_mul_buffer(streams, gpu_indexes, gpu_count, params, + num_radix_blocks, allocate_gpu_memory); } template __host__ void host_integer_scalar_mul_radix( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set, - int_scalar_mul_buffer *mem, void *bsk, T *ksk, + int_scalar_mul_buffer *mem, void **bsks, T **ksks, uint32_t input_lwe_dimension, uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) { if (num_radix_blocks == 0 | num_scalars == 0) return; - cudaSetDevice(gpu_indexes[0]); // lwe_size includes the presence of the body // whereas lwe_dimension is the number of elements in the mask uint32_t lwe_size = input_lwe_dimension + 1; @@ -84,7 +83,7 @@ __host__ void host_integer_scalar_mul_radix( streams[0], gpu_indexes[0]); host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, ptr, shift_amount, - mem->logical_scalar_shift_buffer, bsk, ksk, num_radix_blocks); + mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks); } else { // create trivial assign for value = 0 cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0], @@ -120,8 +119,8 @@ __host__ void host_integer_scalar_mul_radix( } host_integer_sum_ciphertexts_vec_kb( streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, - terms_degree, bsk, ksk, mem->sum_ciphertexts_vec_mem, num_radix_blocks, - j); + terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem, + num_radix_blocks, j); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu 
b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu index eb484558d..51f5da598 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu @@ -1,12 +1,12 @@ #include "scalar_rotate.cuh" void scratch_cuda_integer_radix_scalar_rotate_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, @@ -15,28 +15,30 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_radix_scalar_rotate_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_logical_scalar_shift_buffer **)mem_ptr, num_blocks, params, shift_type, allocate_gpu_memory); } void cuda_integer_radix_scalar_rotate_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t n, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) { + uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks, + uint32_t num_blocks) { host_integer_radix_scalar_rotate_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), n, - (int_logical_scalar_shift_buffer *)mem_ptr, bsk, - static_cast(ksk), num_blocks); + (int_logical_scalar_shift_buffer *)mem_ptr, bsks, + (uint64_t **)(ksks), num_blocks); } -void cleanup_cuda_integer_radix_scalar_rotate(void *stream, uint32_t gpu_index, +void cleanup_cuda_integer_radix_scalar_rotate(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { - cudaSetDevice(gpu_index); int_logical_scalar_shift_buffer *mem_ptr = (int_logical_scalar_shift_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh index 1fe5bc38d..2666951a3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh @@ -13,14 +13,13 @@ template __host__ void scratch_cuda_integer_radix_scalar_rotate_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_logical_scalar_shift_buffer **mem_ptr, uint32_t num_radix_blocks, int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); *mem_ptr = new int_logical_scalar_shift_buffer( - stream, gpu_index, shift_type, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks, allocate_gpu_memory); } @@ -28,9 
+27,7 @@ template __host__ void host_integer_radix_scalar_rotate_kb_inplace( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer *mem, - void *bsk, Torus *ksk, uint32_t num_blocks) { - - cudaSetDevice(gpu_indexes[0]); + void **bsks, Torus **ksks, uint32_t num_blocks) { auto params = mem->params; auto glwe_dimension = params.glwe_dimension; @@ -81,7 +78,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks, - giver_blocks, bsk, ksk, num_blocks, lut_bivariate, + giver_blocks, bsks, ksks, num_blocks, lut_bivariate, lut_bivariate->params.message_modulus); } else { @@ -105,7 +102,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks, - giver_blocks, bsk, ksk, num_blocks, lut_bivariate, + giver_blocks, bsks, ksks, num_blocks, lut_bivariate, lut_bivariate->params.message_modulus); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu index 9f1a892e7..ea1e226cf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu @@ -1,12 +1,12 @@ #include "scalar_shifts.cuh" void scratch_cuda_integer_radix_logical_scalar_shift_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, @@ -15,7 +15,7 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_radix_logical_scalar_shift_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_logical_scalar_shift_buffer **)mem_ptr, num_blocks, params, shift_type, allocate_gpu_memory); } @@ -26,23 +26,23 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64( /// rotations - 1 The remaining blocks are padded with zeros void cuda_integer_radix_logical_scalar_shift_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk, + uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks) { host_integer_radix_logical_scalar_shift_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), shift, - (int_logical_scalar_shift_buffer *)mem_ptr, bsk, - static_cast(ksk), num_blocks); + (int_logical_scalar_shift_buffer *)mem_ptr, bsks, + (uint64_t **)(ksks), num_blocks); } void 
scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, @@ -51,7 +51,7 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_radix_arithmetic_scalar_shift_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_arithmetic_scalar_shift_buffer **)mem_ptr, num_blocks, params, shift_type, allocate_gpu_memory); } @@ -65,34 +65,34 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( /// zeros as would be done in the logical shift. void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk, + uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks) { host_integer_radix_arithmetic_scalar_shift_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), shift, - (int_arithmetic_scalar_shift_buffer *)mem_ptr, bsk, - static_cast(ksk), num_blocks); + (int_arithmetic_scalar_shift_buffer *)mem_ptr, bsks, + (uint64_t **)(ksks), num_blocks); } -void cleanup_cuda_integer_radix_logical_scalar_shift(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { - cudaSetDevice(gpu_index); int_logical_scalar_shift_buffer *mem_ptr = (int_logical_scalar_shift_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } -void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { - cudaSetDevice(gpu_index); int_arithmetic_scalar_shift_buffer *mem_ptr = (int_arithmetic_scalar_shift_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index ed232f3a3..d8f58362c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -14,14 +14,13 @@ template __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t 
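The shift wrappers above repeat the same thin C-API shape used across this patch: opaque void ** parameters for streams and keys cross the FFI boundary and are re-typed before the templated host implementation is called. A compact sketch of that shape (do_work_typed and do_work_64 are stand-in names; only the casting pattern mirrors the patch):

#include <cuda_runtime.h>
#include <cstdint>

template <typename Torus>
void do_work_typed(cudaStream_t *streams, uint32_t *gpu_indexes,
                   uint32_t gpu_count, Torus **ksks, uint32_t num_blocks) {
  // Placeholder body: the real host functions dispatch kernels from here.
  (void)streams; (void)gpu_indexes; (void)gpu_count; (void)ksks; (void)num_blocks;
}

extern "C" void do_work_64(void **streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, void **ksks,
                           uint32_t num_blocks) {
  do_work_typed<uint64_t>((cudaStream_t *)(streams), gpu_indexes, gpu_count,
                          (uint64_t **)(ksks), num_blocks);
}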
gpu_count, int_logical_scalar_shift_buffer **mem_ptr, uint32_t num_radix_blocks, int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); *mem_ptr = new int_logical_scalar_shift_buffer( - stream, gpu_index, shift_type, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks, allocate_gpu_memory); } @@ -29,11 +28,9 @@ template __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array, uint32_t shift, - int_logical_scalar_shift_buffer *mem, void *bsk, Torus *ksk, + int_logical_scalar_shift_buffer *mem, void **bsks, Torus **ksks, uint32_t num_blocks) { - cudaSetDevice(gpu_indexes[0]); - auto params = mem->params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; @@ -87,7 +84,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, partial_current_blocks, - partial_current_blocks, partial_previous_blocks, bsk, ksk, + partial_current_blocks, partial_previous_blocks, bsks, ksks, partial_block_count, lut_bivariate, lut_bivariate->params.message_modulus); @@ -117,7 +114,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, partial_current_blocks, - partial_current_blocks, partial_next_blocks, bsk, ksk, + partial_current_blocks, partial_next_blocks, bsks, ksks, partial_block_count, lut_bivariate, lut_bivariate->params.message_modulus); } @@ -125,14 +122,13 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( template __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_arithmetic_scalar_shift_buffer **mem_ptr, uint32_t num_radix_blocks, int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); *mem_ptr = new int_arithmetic_scalar_shift_buffer( - stream, gpu_index, shift_type, params, num_radix_blocks, + streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks, allocate_gpu_memory); } @@ -140,7 +136,7 @@ template __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array, uint32_t shift, - int_arithmetic_scalar_shift_buffer *mem, void *bsk, Torus *ksk, + int_arithmetic_scalar_shift_buffer *mem, void **bsks, Torus **ksks, uint32_t num_blocks) { cudaSetDevice(gpu_indexes[0]); @@ -215,7 +211,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( if (shift_within_block != 0 && rotations != num_blocks) { integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, partial_current_blocks, - partial_current_blocks, partial_next_blocks, bsk, ksk, + partial_current_blocks, partial_next_blocks, bsks, ksks, partial_block_count, lut_bivariate, lut_bivariate->params.message_modulus); } @@ -229,7 +225,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( { integer_radix_apply_univariate_lookup_table_kb( &mem->local_stream_1, &gpu_indexes[0], 1, padding_block, - last_block_copy, bsk, ksk, 1, lut_univariate_padding_block); + last_block_copy, bsks, ksks, 1, lut_univariate_padding_block); // Replace blocks 'pulled' from the left with the 
correct padding block for (uint i = 0; i < rotations; i++) { cuda_memcpy_async_gpu_to_gpu( @@ -243,7 +239,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( if (shift_within_block != 0 && rotations != num_blocks) { integer_radix_apply_univariate_lookup_table_kb( &mem->local_stream_2, &gpu_indexes[0], 1, last_block, - last_block_copy, bsk, ksk, 1, lut_univariate_shift_last_block); + last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu index 1936e241c..bf2ba84d1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu @@ -1,13 +1,13 @@ #include "shift_and_rotate.cuh" void scratch_cuda_integer_radix_shift_and_rotate_kb_64( - void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, - bool allocate_gpu_memory) { + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, + bool is_signed, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -15,29 +15,29 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64( message_modulus, carry_modulus); scratch_cuda_integer_radix_shift_and_rotate_kb( - static_cast(stream), gpu_index, + (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_shift_and_rotate_buffer **)mem_ptr, num_blocks, params, shift_type, is_signed, allocate_gpu_memory); } void cuda_integer_radix_shift_and_rotate_kb_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - void *lwe_shift, int8_t *mem_ptr, void *bsk, void *ksk, + void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks) { host_integer_radix_shift_and_rotate_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(lwe_shift), - (int_shift_and_rotate_buffer *)mem_ptr, bsk, - static_cast(ksk), num_blocks); + (int_shift_and_rotate_buffer *)mem_ptr, bsks, + (uint64_t **)(ksks), num_blocks); } -void cleanup_cuda_integer_radix_shift_and_rotate(void *stream, - uint32_t gpu_index, +void cleanup_cuda_integer_radix_shift_and_rotate(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, int8_t **mem_ptr_void) { - cudaSetDevice(gpu_index); int_shift_and_rotate_buffer *mem_ptr = (int_shift_and_rotate_buffer *)(*mem_ptr_void); - mem_ptr->release(static_cast(stream), gpu_index); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index e52a998f6..2106cd629 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -14,21 +14,20 @@ template __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb( - cudaStream_t stream, uint32_t gpu_index, + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_shift_and_rotate_buffer **mem_ptr, uint32_t num_radix_blocks, int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); *mem_ptr = new int_shift_and_rotate_buffer( - stream, gpu_index, shift_type, is_signed, params, num_radix_blocks, - allocate_gpu_memory); + streams, gpu_indexes, gpu_count, shift_type, is_signed, params, + num_radix_blocks, allocate_gpu_memory); } template __host__ void host_integer_radix_shift_and_rotate_kb_inplace( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer *mem, - void *bsk, Torus *ksk, uint32_t num_radix_blocks) { + void **bsks, Torus **ksks, uint32_t num_radix_blocks) { uint32_t bits_per_block = std::log2(mem->params.message_modulus); uint32_t total_nb_bits = bits_per_block * num_radix_blocks; if (total_nb_bits == 0) @@ -42,8 +41,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // Extract all bits auto bits = mem->tmp_bits; - extract_n_bits(streams, gpu_indexes, gpu_count, bits, lwe_array, bsk, - ksk, num_radix_blocks, bits_per_block, + extract_n_bits(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks, + ksks, num_radix_blocks, bits_per_block, mem->bit_extract_luts); // Extract shift bits @@ -64,7 +63,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // so that it is already aligned to the correct position of the cmux input // and we reduce noise growth extract_n_bits(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift, - bsk, ksk, 1, max_num_bits_that_tell_shift, + bsks, ksks, 1, max_num_bits_that_tell_shift, mem->bit_extract_luts_with_offset_2); // If signed, do an "arithmetic shift" by padding with the sign bit @@ -154,7 +153,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // we have // control_bit|b|a integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsk, ksk, + streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks, total_nb_bits, mux_lut); } @@ -192,7 +191,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // To give back a clean ciphertext auto cleaning_lut = mem->cleaning_lut; integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsk, ksk, + streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks, num_radix_blocks, cleaning_lut); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 836f4df9f..7e7695989 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -7,7 +7,7 @@ #endif #include "device.h" -#include "helper.h" +#include "helper_multi_gpu.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" #include diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index fb2412456..4e46756d5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh 
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ -65,7 +65,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, double2 *dest, ST *src, uint32_t polynomial_size, uint32_t total_polynomials) { - cudaSetDevice(gpu_index); int shared_memory_size = sizeof(double) * polynomial_size; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index b19d71be7..9760be2c8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -7,6 +7,7 @@ #include "programmable_bootstrap_multibit.h" #include "cooperative_groups.h" +#include "helper_multi_gpu.h" using namespace cooperative_groups; namespace cg = cooperative_groups; @@ -117,18 +118,19 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft, } template -void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, void *bootstrapping_key, - int8_t *pbs_buffer, uint32_t glwe_dimension, - uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, PBS_TYPE pbs_type) { - auto num_inputs_on_gpu = input_lwe_ciphertext_count / gpu_count; +void execute_pbs( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, + void **bootstrapping_keys, std::vector pbs_buffer, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, + uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, + uint32_t max_shared_memory, PBS_TYPE pbs_type, bool sync_streams = true) { + auto active_gpu_count = + get_active_gpu_count(input_lwe_ciphertext_count, gpu_count); + if (sync_streams) + cuda_synchronize_stream(streams[0], gpu_indexes[0]); switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits @@ -136,14 +138,24 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, case MULTI_BIT: PANIC("Error: 32-bit multibit PBS is not supported.\n") case CLASSICAL: - cuda_programmable_bootstrap_lwe_ciphertext_vector_32( - streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes, - lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory); +#pragma omp parallel for num_threads(active_gpu_count) + for (uint i = 0; i < active_gpu_count; i++) { + int num_inputs_on_gpu = + get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + int gpu_offset = + get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); + auto d_lut_vector_indexes = + lut_vector_indexes + (ptrdiff_t)(gpu_offset); + cuda_programmable_bootstrap_lwe_ciphertext_vector_32( + streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, + lut_vector, d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, + polynomial_size, base_log, 
level_count, num_inputs_on_gpu, num_luts, + lwe_idx, max_shared_memory, gpu_offset); + } break; default: + PANIC("Error: unsupported cuda PBS type.") break; } break; @@ -153,20 +165,39 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, case MULTI_BIT: if (grouping_factor == 0) PANIC("Multi-bit PBS error: grouping factor should be > 0.") - cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes, - lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory); +#pragma omp parallel for num_threads(active_gpu_count) + for (uint i = 0; i < active_gpu_count; i++) { + int num_inputs_on_gpu = + get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + int gpu_offset = + get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); + auto d_lut_vector_indexes = + lut_vector_indexes + (ptrdiff_t)(gpu_offset); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( + streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, + lut_vector, d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, + num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory, + gpu_offset); + } break; case CLASSICAL: - cuda_programmable_bootstrap_lwe_ciphertext_vector_64( - streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes, - lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory); +#pragma omp parallel for num_threads(active_gpu_count) + for (uint i = 0; i < active_gpu_count; i++) { + int num_inputs_on_gpu = + get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + int gpu_offset = + get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); + auto d_lut_vector_indexes = + lut_vector_indexes + (ptrdiff_t)(gpu_offset); + cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, + lut_vector, d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, + lwe_idx, max_shared_memory, gpu_offset); + } break; default: PANIC("Error: unsupported cuda PBS type.") @@ -176,6 +207,11 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " "moduli are supported.") } + + if (sync_streams) + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } template @@ -186,8 +222,6 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, PBS_TYPE pbs_type, bool allocate_gpu_memory) { - if (gpu_index != 0) - PANIC("GPU error (pbs): all memory has to reside in GPU 0.") switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index 89c1d2a9c..de2f01d5b 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -158,7 +158,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t max_shared_memory, uint32_t gpu_offset) { if (base_log > 32) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -172,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 512: host_programmable_bootstrap_amortized>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 1024: host_programmable_bootstrap_amortized>( @@ -190,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 2048: host_programmable_bootstrap_amortized>( @@ -199,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 4096: host_programmable_bootstrap_amortized>( @@ -208,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 8192: host_programmable_bootstrap_amortized>( @@ -217,7 +217,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 16384: host_programmable_bootstrap_amortized>( @@ -226,7 +226,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t 
*)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -307,7 +307,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t max_shared_memory, uint32_t gpu_offset) { if (base_log > 64) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -321,7 +321,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 512: host_programmable_bootstrap_amortized>( @@ -330,7 +330,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 1024: host_programmable_bootstrap_amortized>( @@ -339,7 +339,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 2048: host_programmable_bootstrap_amortized>( @@ -348,7 +348,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 4096: host_programmable_bootstrap_amortized>( @@ -357,7 +357,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 8192: host_programmable_bootstrap_amortized>( @@ -366,7 +366,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, 
max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case 16384: host_programmable_bootstrap_amortized>( @@ -375,7 +375,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index d5c674a44..eae1b593f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -52,7 +52,7 @@ __global__ void device_programmable_bootstrap_amortized( double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, - size_t device_memory_size_per_sample) { + size_t device_memory_size_per_sample, uint32_t gpu_offset) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -79,7 +79,8 @@ __global__ void device_programmable_bootstrap_amortized( (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2); auto block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * (glwe_dimension + 1)]; @@ -197,7 +198,7 @@ __global__ void device_programmable_bootstrap_amortized( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x] * + &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * (glwe_dimension * polynomial_size + 1)]; // The blind rotation for this block is over @@ -257,8 +258,8 @@ __host__ void scratch_programmable_bootstrap_amortized( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); + cudaSetDevice(gpu_index); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); @@ -298,7 +299,7 @@ __host__ void host_programmable_bootstrap_amortized( int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t max_shared_memory, uint32_t gpu_offset) { cudaSetDevice(gpu_index); uint64_t SM_FULL = @@ -332,14 +333,14 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_FULL); + level_count, lwe_idx, DM_FULL, gpu_offset); } else if (max_shared_memory < SM_FULL) { device_programmable_bootstrap_amortized <<>>( lwe_array_out, 
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_PART); + level_count, lwe_idx, DM_PART, gpu_offset); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. Shared memory on the @@ -351,7 +352,7 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, 0); + level_count, lwe_idx, 0, gpu_offset); } check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 922f71955..97c01bdfe 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -40,7 +40,8 @@ __global__ void device_programmable_bootstrap_cg( Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - int8_t *device_mem, uint64_t device_memory_size_per_block) { + int8_t *device_mem, uint64_t device_memory_size_per_block, + uint32_t gpu_offset) { grid_group grid = this_grid(); @@ -74,7 +75,8 @@ __global__ void device_programmable_bootstrap_cg( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * (glwe_dimension + 1)]; @@ -138,7 +140,7 @@ __global__ void device_programmable_bootstrap_cg( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z] * + &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -161,7 +163,6 @@ __host__ void scratch_programmable_bootstrap_cg( bool allocate_gpu_memory) { cudaSetDevice(gpu_index); - uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); uint64_t partial_sm = @@ -201,7 +202,7 @@ __host__ void host_programmable_bootstrap_cg( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory) { + uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -223,7 +224,7 @@ __host__ void host_programmable_bootstrap_cg( int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); - void *kernel_args[14]; + void *kernel_args[15]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; @@ -237,6 +238,7 @@ __host__ void host_programmable_bootstrap_cg( kernel_args[10] = &base_log; kernel_args[11] = &level_count; 
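// Note on the cooperative-launch plumbing here: kernel_args now holds 15
// pointers. Slots 0-12 are the existing operands ending with &d_mem, slot 13
// is still chosen just below according to the shared-memory budget (the first
// branch assigns &full_dm), and the new slot 14 always carries &gpu_offset so
// the device kernel can shift its index lookups per GPU.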
kernel_args[12] = &d_mem; + kernel_args[14] = &gpu_offset; if (max_shared_memory < partial_sm) { kernel_args[13] = &full_dm; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index bf0564132..5eba4f2f8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -26,7 +26,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block) { + uint64_t device_memory_size_per_block, uint32_t gpu_offset) { grid_group grid = this_grid(); @@ -55,7 +55,8 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * (glwe_dimension + 1)]; @@ -116,7 +117,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z] * + &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -177,7 +178,6 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); - uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); @@ -258,7 +258,8 @@ __host__ void execute_cg_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { + uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset, + uint32_t gpu_offset) { cudaSetDevice(gpu_index); uint64_t full_dm = @@ -281,7 +282,7 @@ __host__ void execute_cg_external_product_loop( auto global_accumulator = buffer->global_accumulator; auto buffer_fft = buffer->global_accumulator_fft; - void *kernel_args[20]; + void *kernel_args[21]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; @@ -301,6 +302,7 @@ __host__ void execute_cg_external_product_loop( kernel_args[16] = &chunk_size; kernel_args[17] = &keybundle_size_per_input; kernel_args[18] = &d_mem; + kernel_args[20] = &gpu_offset; dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); dim3 thds(polynomial_size / params::opt, 1, 1); @@ -335,7 +337,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0) { + uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { 
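// gpu_offset slots in before the defaulted lwe_chunk_size: in C++ a parameter
// with a default argument may only be followed by other defaulted parameters,
// so the new mandatory argument has to precede it. Reduced, hypothetical
// illustration of the call shapes this ordering keeps valid:
//   void launch(uint32_t num_samples, uint32_t gpu_offset, uint32_t chunk = 0);
//   launch(n, offset);       // still falls back to chunk = 0
//   launch(n, offset, 128);  // explicit chunk size override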
cudaSetDevice(gpu_index); if (!lwe_chunk_size) @@ -350,7 +352,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + lwe_chunk_size, lwe_offset, gpu_offset); // Accumulate execute_cg_external_product_loop( @@ -358,7 +360,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset); + max_shared_memory, lwe_offset, gpu_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 2875491d3..8f2a59e80 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -132,7 +132,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { switch (polynomial_size) { case 256: @@ -141,7 +141,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 512: host_programmable_bootstrap_tbc>( @@ -149,7 +149,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 1024: host_programmable_bootstrap_tbc>( @@ -157,7 +157,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 2048: host_programmable_bootstrap_tbc>( @@ -165,7 +165,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 4096: host_programmable_bootstrap_tbc>( @@ -173,7 +173,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 8192: 
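// Every polynomial-size branch of this switch forwards gpu_offset unchanged to
// host_programmable_bootstrap_tbc; the dispatch itself is untouched, and the
// offset only takes effect inside the device kernels, where it shifts the
// lwe_input_indexes / lwe_output_indexes lookups.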
host_programmable_bootstrap_tbc>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 16384: host_programmable_bootstrap_tbc>( @@ -189,7 +189,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -411,7 +411,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { switch (polynomial_size) { case 256: @@ -420,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 512: host_programmable_bootstrap_cg>( @@ -428,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 1024: host_programmable_bootstrap_cg>( @@ -436,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 2048: host_programmable_bootstrap_cg>( @@ -444,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 4096: host_programmable_bootstrap_cg>( @@ -452,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 8192: host_programmable_bootstrap_cg>( @@ -460,7 +460,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, 
level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 16384: host_programmable_bootstrap_cg>( @@ -468,7 +468,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -485,7 +485,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { switch (polynomial_size) { case 256: @@ -494,7 +494,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 512: host_programmable_bootstrap>( @@ -502,7 +502,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 1024: host_programmable_bootstrap>( @@ -510,7 +510,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 2048: host_programmable_bootstrap>( @@ -518,7 +518,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 4096: host_programmable_bootstrap>( @@ -526,7 +526,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 8192: host_programmable_bootstrap>( @@ -534,7 +534,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; case 16384: host_programmable_bootstrap>( @@ -542,7 +542,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, 
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + num_luts, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -560,7 +560,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t max_shared_memory, uint32_t gpu_offset) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be > number of bits " @@ -582,7 +582,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + num_luts, lwe_idx, max_shared_memory, gpu_offset); #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif @@ -598,7 +598,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -611,7 +611,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + num_luts, lwe_idx, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -697,7 +697,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t max_shared_memory, uint32_t gpu_offset) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (64)"); @@ -718,7 +718,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + num_luts, lwe_idx, max_shared_memory, gpu_offset); #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif @@ -734,7 +734,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + num_luts, lwe_idx, max_shared_memory, gpu_offset); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -747,7 +747,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + num_luts, lwe_idx, max_shared_memory, gpu_offset); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -777,7 +777,7 @@ template void 
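// The explicit instantiations that follow must repeat the new trailing
// `uint32_t gpu_offset` parameter verbatim: an explicit instantiation only
// compiles if it matches a declared template signature, and these are the
// instantiations other translation units rely on. Reduced, hypothetical
// example of the pattern:
//   template <typename Torus>
//   void pbs_entry(Torus *lwe_array_out, uint32_t num_samples,
//                  uint32_t gpu_offset);
//   template void pbs_entry<uint64_t>(uint64_t *lwe_array_out,
//                                     uint32_t num_samples,
//                                     uint32_t gpu_offset);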
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -787,7 +787,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -810,7 +810,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -820,7 +820,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -851,7 +851,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t *lwe_output_indexes, uint64_t *lut_vector, @@ -860,7 +860,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index fd811d84f..baa50954c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -23,7 +23,7 @@ __global__ void device_programmable_bootstrap_step_one( Torus *global_accumulator, double2 *global_accumulator_fft, uint32_t lwe_iteration, uint32_t 
lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block) { + uint64_t device_memory_size_per_block, uint32_t gpu_offset) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -51,7 +51,8 @@ __global__ void device_programmable_bootstrap_step_one( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * (glwe_dimension + 1)]; @@ -133,7 +134,7 @@ __global__ void device_programmable_bootstrap_step_two( Torus *global_accumulator, double2 *global_accumulator_fft, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block) { + uint64_t device_memory_size_per_block, uint32_t gpu_offset) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -199,7 +200,7 @@ __global__ void device_programmable_bootstrap_step_two( if (lwe_iteration + 1 == lwe_dimension) { // Last iteration auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x] * + &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -271,7 +272,6 @@ __host__ void scratch_programmable_bootstrap( bool allocate_gpu_memory) { cudaSetDevice(gpu_index); - uint64_t full_sm_step_one = get_buffer_size_full_sm_programmable_bootstrap_step_one( polynomial_size); @@ -325,15 +325,17 @@ __host__ void scratch_programmable_bootstrap( } template -__host__ void execute_step_one( - cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { +__host__ void +execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + uint32_t max_shared_memory, int lwe_iteration, + uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, + uint64_t full_dm, uint32_t gpu_offset) { cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; @@ -345,35 +347,37 @@ __host__ void execute_step_one( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm); + level_count, 
d_mem, full_dm, gpu_offset); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm); + level_count, d_mem, partial_dm, gpu_offset); } else { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0); + level_count, d_mem, 0, gpu_offset); } check_cuda_error(cudaGetLastError()); } template -__host__ void execute_step_two( - cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { +__host__ void +execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, double2 *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + uint32_t max_shared_memory, int lwe_iteration, + uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, + uint64_t full_dm, uint32_t gpu_offset) { cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; @@ -385,21 +389,21 @@ __host__ void execute_step_two( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm); + level_count, d_mem, full_dm, gpu_offset); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm); + level_count, d_mem, partial_dm, gpu_offset); } else { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0); + level_count, d_mem, 0, gpu_offset); } check_cuda_error(cudaGetLastError()); } @@ -414,7 +418,7 @@ __host__ void host_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory) { + uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -445,14 +449,14 @@ __host__ void host_programmable_bootstrap( global_accumulator_fft, 
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one, - full_dm_step_one); + full_dm_step_one, gpu_offset); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two, - full_dm_step_two); + full_dm_step_two, gpu_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index e994e203a..f04a3c8be 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -75,7 +75,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t gpu_offset, uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -89,7 +89,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); break; case 512: host_cg_multi_bit_programmable_bootstrap 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -175,7 +182,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -183,7 +191,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); break; case 1024: host_multi_bit_programmable_bootstrap *buffer = (pbs_buffer *)mem_ptr; @@ -262,7 +277,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") #endif @@ -277,7 
+293,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -289,7 +306,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -557,7 +575,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t gpu_offset, uint32_t lwe_chunk_size); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( @@ -577,7 +595,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t gpu_offset, uint32_t lwe_chunk_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -665,7 +683,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t gpu_offset, uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -679,7 +697,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, + lwe_chunk_size); break; case 512: host_tbc_multi_bit_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t gpu_offset, uint32_t lwe_chunk_size); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index db971d7b7..8b8abc0f7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -39,7 +39,7 @@ __global__ void 
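// Same indexing convention as in the classic, cg and tbc kernels above: the
// block index (input_idx / blockIdx.z) stays local to the slice launched on
// this GPU, and gpu_offset shifts the lookup into the index arrays so each
// block resolves its entry of the global batch. Illustrative shape of the
// access pattern, using the names that appear in these kernels:
//   Torus *block_lwe_array_in =
//       &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] *
//                     (lwe_dimension + 1)];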
device_multi_bit_programmable_bootstrap_keybundle( uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block) { + uint64_t device_memory_size_per_block, uint32_t gpu_offset) { extern __shared__ int8_t sharedmem[]; int8_t *selected_memory = sharedmem; @@ -64,7 +64,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( Torus *accumulator = (Torus *)selected_memory; Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] * + (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + // select the input @@ -152,7 +153,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( double2 *global_accumulator_fft, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem, - uint64_t device_memory_size_per_block) { + uint64_t device_memory_size_per_block, uint32_t gpu_offset) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -179,7 +180,8 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( accumulator_fft = (double2 *)sharedmem; Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * (glwe_dimension + 1)]; @@ -246,7 +248,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two( uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem, - uint64_t device_memory_size_per_block) { + uint64_t device_memory_size_per_block, uint32_t gpu_offset) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -310,7 +312,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two( if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) { // Last iteration auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x] * + &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -377,7 +379,6 @@ __host__ void scratch_multi_bit_programmable_bootstrap( uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); - uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); @@ -485,7 +486,8 @@ __host__ void execute_compute_keybundle( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset, + uint32_t gpu_offset) { cudaSetDevice(gpu_index); uint32_t chunk_size = @@ -513,26 +515,27 @@ __host__ void execute_compute_keybundle( lwe_array_in, 
lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_sm_keybundle); + keybundle_size_per_input, d_mem, full_sm_keybundle, gpu_offset); else device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0); + keybundle_size_per_input, d_mem, 0, gpu_offset); check_cuda_error(cudaGetLastError()); } template -__host__ void -execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, int j, int lwe_offset) { +__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, + Torus *lut_vector, Torus *lut_vector_indexes, + Torus *lwe_array_in, Torus *lwe_input_indexes, + pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t max_shared_memory, + int j, int lwe_offset, uint32_t gpu_offset) { cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_one = @@ -557,7 +560,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, - j + lwe_offset, d_mem, full_sm_accumulate_step_one); + j + lwe_offset, d_mem, full_sm_accumulate_step_one, gpu_offset); else if (max_shared_memory < full_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one @@ -566,7 +569,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, j + lwe_offset, - d_mem, partial_sm_accumulate_step_one); + d_mem, partial_sm_accumulate_step_one, gpu_offset); else device_multi_bit_programmable_bootstrap_accumulate_step_one @@ -575,7 +578,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, j + lwe_offset, - d_mem, 0); + d_mem, 0, gpu_offset); check_cuda_error(cudaGetLastError()); } @@ -587,14 +590,13 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, uint32_t max_shared_memory, int j, - int lwe_offset, uint32_t lwe_chunk_size) { + int lwe_offset, uint32_t lwe_chunk_size, uint32_t gpu_offset) { + cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_two = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( polynomial_size); - cudaSetDevice(gpu_index); - // auto d_mem = buffer->d_mem_acc_step_two; auto keybundle_fft = buffer->keybundle_fft; auto global_accumulator = buffer->global_accumulator; @@ -610,7 +612,8 @@ execute_step_two(cudaStream_t stream, uint32_t 
gpu_index, Torus *lwe_array_out, lwe_array_out, lwe_output_indexes, keybundle_fft, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, j, - lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two); + lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two, + gpu_offset); else device_multi_bit_programmable_bootstrap_accumulate_step_two @@ -618,7 +621,8 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0); + grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0, + gpu_offset); check_cuda_error(cudaGetLastError()); } @@ -631,7 +635,7 @@ __host__ void host_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0) { + uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); // If a chunk size is not passed to this function, select one. @@ -647,7 +651,7 @@ __host__ void host_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + lwe_chunk_size, lwe_offset, gpu_offset); // Accumulate uint32_t chunk_size = std::min( lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -656,13 +660,13 @@ __host__ void host_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, max_shared_memory, j, - lwe_offset); + lwe_offset, gpu_offset); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, max_shared_memory, j, lwe_offset, - lwe_chunk_size); + lwe_chunk_size, gpu_offset); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index 072b95a23..71aaca6b6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -40,8 +40,8 @@ __global__ void device_programmable_bootstrap_tbc( Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - int8_t *device_mem, uint64_t device_memory_size_per_block, - bool support_dsm) { + int8_t *device_mem, uint64_t device_memory_size_per_block, bool support_dsm, + uint32_t gpu_offset) { cluster_group cluster = this_cluster(); @@ -78,7 +78,8 @@ __global__ void device_programmable_bootstrap_tbc( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + 
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * (glwe_dimension + 1)]; @@ -142,7 +143,7 @@ __global__ void device_programmable_bootstrap_tbc( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z] * + &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -223,7 +224,7 @@ __host__ void host_programmable_bootstrap_tbc( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory) { + uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { cudaSetDevice(gpu_index); auto supports_dsm = @@ -277,7 +278,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, - supports_dsm)); + supports_dsm, gpu_offset)); } else if (max_shared_memory < full_sm + minimum_sm_tbc) { config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; @@ -286,7 +287,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, - partial_dm, supports_dsm)); + partial_dm, supports_dsm, gpu_offset)); } else { config.dynamicSmemBytes = full_sm + minimum_sm_tbc; @@ -295,7 +296,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, - supports_dsm)); + supports_dsm, gpu_offset)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 4bd705a9f..9378cc832 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -26,7 +26,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm) { + uint64_t device_memory_size_per_block, bool support_dsm, + uint32_t gpu_offset) { cluster_group cluster = this_cluster(); @@ -62,7 +63,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * + (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * (glwe_dimension + 1)]; @@ -123,7 +125,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { auto 
block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z] * + &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -267,8 +269,10 @@ __host__ void execute_tbc_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { + uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset, + uint32_t gpu_offset) { + cudaSetDevice(gpu_index); auto supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< Torus>(polynomial_size, max_shared_memory); @@ -326,7 +330,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_dm, supports_dsm)); + keybundle_size_per_input, d_mem, full_dm, supports_dsm, gpu_offset)); } else if (max_shared_memory < full_dm + minimum_dm) { config.dynamicSmemBytes = partial_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( @@ -337,7 +341,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, partial_dm, supports_dsm)); + keybundle_size_per_input, d_mem, partial_dm, supports_dsm, gpu_offset)); } else { config.dynamicSmemBytes = full_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( @@ -348,7 +352,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, supports_dsm)); + keybundle_size_per_input, d_mem, 0, supports_dsm, gpu_offset)); } } @@ -361,7 +365,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0) { + uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) @@ -376,7 +380,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + lwe_chunk_size, lwe_offset, gpu_offset); // Accumulate execute_tbc_external_product_loop( @@ -384,7 +388,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset); + max_shared_memory, lwe_offset, gpu_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_debug.cuh similarity index 100% rename from 
backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh rename to backends/tfhe-cuda-backend/cuda/src/utils/helper_debug.cuh diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu new file mode 100644 index 000000000..aca9d8eff --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu @@ -0,0 +1,100 @@ +#include "device.h" +#include "helper_multi_gpu.h" + +std::mutex m; +bool p2p_enabled = false; + +int cuda_setup_multi_gpu() { + int num_gpus = cuda_get_number_of_gpus(); + if (num_gpus == 0) + PANIC("GPU error: the number of GPUs should be > 0.") + int num_used_gpus = 1; + if (num_gpus > 1) { + m.lock(); + if (!p2p_enabled) { + p2p_enabled = true; + int has_peer_access_to_device_0; + for (int i = 1; i < num_gpus; i++) { + check_cuda_error( + cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0)); + if (has_peer_access_to_device_0) { + cudaMemPool_t mempool; + cudaMemAccessDesc desc = {}; + // Enable P2P Access and mempool access + check_cuda_error(cudaSetDevice(i)); + check_cuda_error(cudaDeviceEnablePeerAccess(0, 0)); + + check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, 0)); + desc.location.type = cudaMemLocationTypeDevice; + desc.location.id = i; + desc.flags = cudaMemAccessFlagsProtReadWrite; + check_cuda_error( + cudaMemPoolSetAccess(mempool, &desc, 1 /* numDescs */)); + num_used_gpus += 1; + } else { + break; + } + } + } else { + int has_peer_access_to_device_0; + for (int i = 1; i < num_gpus; i++) { + check_cuda_error( + cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0)); + if (has_peer_access_to_device_0) { + num_used_gpus += 1; + } else { + break; + } + } + } + m.unlock(); + } + return num_used_gpus; +} + +int get_active_gpu_count(int num_inputs, int gpu_count) { + int active_gpu_count = gpu_count; + if (gpu_count > num_inputs) { + active_gpu_count = num_inputs; + } + return active_gpu_count; +} + +int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count) { + int gpu_offset = 0; + for (uint i = 0; i < gpu_index; i++) + gpu_offset += get_num_inputs_on_gpu(total_num_inputs, i, gpu_count); + return gpu_offset; +} + +int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count) { + + int num_inputs = 0; + // If there are fewer inputs than GPUs, not all GPUs are active and GPU 0 + // handles everything + if (gpu_count > total_num_inputs) { + if (gpu_index < total_num_inputs) { + num_inputs = 1; + } + } else { + // If there are more inputs than GPUs, all GPUs are active and compute over + // a chunk of the total inputs. The chunk size is smaller on the last GPUs. 
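// Worked example of the split computed below (illustrative only): 10 inputs
// over 4 GPUs gives y = ceil(10/4) * 4 - 10 = 2, hence cutoff = 4 - 2 = 2,
// large_input_num = 3 and small_input_num = 2; GPUs 0 and 1 each take 3 inputs
// and GPUs 2 and 3 take 2, for a total of 3 + 3 + 2 + 2 = 10.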
+ int small_input_num, large_input_num, cutoff; + if (total_num_inputs % gpu_count == 0) { + small_input_num = total_num_inputs / gpu_count; + large_input_num = small_input_num; + cutoff = 0; + } else { + int y = ceil((double)total_num_inputs / (double)gpu_count) * gpu_count - + total_num_inputs; + cutoff = gpu_count - y; + small_input_num = total_num_inputs / gpu_count; + large_input_num = (int)ceil((double)total_num_inputs / (double)gpu_count); + } + if (gpu_index < cutoff) + num_inputs = large_input_num; + else + num_inputs = small_input_num; + } + return num_inputs; +} diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/multi_gpu_helper.cu b/backends/tfhe-cuda-backend/cuda/src/utils/multi_gpu_helper.cu deleted file mode 100644 index eb008e37e..000000000 --- a/backends/tfhe-cuda-backend/cuda/src/utils/multi_gpu_helper.cu +++ /dev/null @@ -1,52 +0,0 @@ -#include "device.h" -#include "helper.h" - -int cuda_setup_multi_gpu() { - - int num_gpus = cuda_get_number_of_gpus(); - if (num_gpus == 0) - PANIC("GPU error: the number of GPUs should be > 0.") - - if (num_gpus > 1) { - int can_access_peer; - for (int i = 0; i < num_gpus; i++) { - cudaSetDevice(i); - for (int j = 0; i < num_gpus; i++) { - if (i == j) - break; - check_cuda_error(cudaDeviceCanAccessPeer(&can_access_peer, i, j)); - cudaDeviceEnablePeerAccess(j, 0); - - if (!can_access_peer) - PANIC("Multi GPU error: all GPUs should have peer access to GPU each " - "other.") - } - } - } - return num_gpus; -} - -void multi_gpu_checks(uint32_t gpu_count) { - - if (gpu_count == 0) - PANIC("GPU error: the number of GPUs should be > 0.") - - if (gpu_count > cuda_get_number_of_gpus()) - PANIC("Multi GPU error: the number of cuda streams should be lower than " - "the number of GPUs on the machine.") - - if (gpu_count > 1) { - int can_access_peer; - for (int i = 1; i < gpu_count; i++) { - cudaSetDevice(i); - for (int j = 0; i < gpu_count; i++) { - if (i == j) - break; - check_cuda_error(cudaDeviceCanAccessPeer(&can_access_peer, i, j)); - if (!can_access_peer) - PANIC("Multi GPU error: all GPUs should have peer access to GPU each " - "other.") - } - } - } -} diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_keyswitch.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_keyswitch.cpp index 2b36dc504..7fb1f7533 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_keyswitch.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_keyswitch.cpp @@ -1,3 +1,4 @@ +#include "helper_multi_gpu.h" #include #include #include @@ -19,6 +20,155 @@ typedef struct { int number_of_inputs; } KeyswitchTestParams; +class KeyswitchMultiGPUTestPrimitives_u64 + : public ::testing::TestWithParam { +protected: + int input_lwe_dimension; + int output_lwe_dimension; + DynamicDistribution noise_distribution; + int ksk_base_log; + int ksk_level; + int message_modulus; + int carry_modulus; + int number_of_inputs; + int payload_modulus; + uint64_t delta; + std::vector streams; + uint64_t *lwe_sk_in_array; + uint64_t *lwe_sk_out_array; + uint64_t *plaintexts; + uint64_t *d_ksk_array; + uint64_t *d_lwe_ct_out_array; + uint64_t *d_lwe_ct_in_array; + uint64_t *lwe_in_ct; + uint64_t *lwe_out_ct; + uint64_t *lwe_input_indexes; + uint64_t *lwe_output_indexes; + + // Data stays at gpu 0 + uint32_t gpu_index = 0; + uint gpu_count = 0; + uint active_gpu_count = 0; + uint base_num_inputs_on_gpu = 0; + +public: + // Test arithmetic functions + void SetUp() { + // TestParams + input_lwe_dimension = 
(int)GetParam().input_lwe_dimension; + output_lwe_dimension = (int)GetParam().output_lwe_dimension; + noise_distribution = (DynamicDistribution)GetParam().noise_distribution; + ksk_base_log = (int)GetParam().ksk_base_log; + ksk_level = (int)GetParam().ksk_level; + message_modulus = (int)GetParam().message_modulus; + carry_modulus = (int)GetParam().carry_modulus; + number_of_inputs = (int)GetParam().number_of_inputs; + + // Enable Multi-GPU logic + gpu_count = cuda_setup_multi_gpu(); + active_gpu_count = std::min((uint)number_of_inputs, gpu_count); + for (uint gpu_i = 0; gpu_i < active_gpu_count; gpu_i++) { + streams.push_back(cuda_create_stream(gpu_i)); + } + + Seed seed; + init_seed(&seed); + + base_num_inputs_on_gpu = + number_of_inputs / gpu_count + (number_of_inputs % gpu_count != 0); + lwe_out_ct = (uint64_t *)malloc((output_lwe_dimension + 1) * + number_of_inputs * sizeof(uint64_t)); + + keyswitch_setup(streams[0], gpu_index, &seed, &lwe_sk_in_array, + &lwe_sk_out_array, &d_ksk_array, &plaintexts, + &d_lwe_ct_in_array, &lwe_input_indexes, &d_lwe_ct_out_array, + &lwe_output_indexes, input_lwe_dimension, + output_lwe_dimension, noise_distribution, ksk_base_log, + ksk_level, message_modulus, carry_modulus, &payload_modulus, + &delta, number_of_inputs, REPETITIONS, SAMPLES); + cuda_synchronize_stream(streams[0], gpu_index); + } + + void TearDown() { + keyswitch_teardown(streams[0], gpu_index, lwe_sk_in_array, lwe_sk_out_array, + d_ksk_array, plaintexts, d_lwe_ct_in_array, + lwe_input_indexes, d_lwe_ct_out_array, + lwe_output_indexes); + if (active_gpu_count > 1) { + for (uint gpu_i = 1; gpu_i < active_gpu_count; gpu_i++) { + cuda_destroy_stream(streams[gpu_i], gpu_i); + } + } + free(lwe_out_ct); + } +}; + +TEST_P(KeyswitchMultiGPUTestPrimitives_u64, keyswitch) { + for (uint r = 0; r < REPETITIONS; r++) { + uint64_t *lwe_out_sk = + lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension); + int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension; + uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r); + for (uint s = 0; s < SAMPLES; s++) { + uint64_t *d_lwe_ct_in = + d_lwe_ct_in_array + + (ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) * + (input_lwe_dimension + 1)); + +#pragma omp parallel for num_threads(active_gpu_count) + for (uint gpu_i = 0; gpu_i < active_gpu_count; gpu_i++) { + auto num_inputs = base_num_inputs_on_gpu; + /// If the index reaches the last GPU, add the remainder of inputs/gpus + /// to the number of inputs on the last GPU + if (gpu_i == gpu_count - 1) + num_inputs = + number_of_inputs - base_num_inputs_on_gpu * (gpu_count - 1); + + auto input_lwe_start_index = + gpu_i * base_num_inputs_on_gpu * (input_lwe_dimension + 1); + auto output_lwe_start_index = + gpu_i * base_num_inputs_on_gpu * (output_lwe_dimension + 1); + + auto d_lwe_ct_in_slice = + d_lwe_ct_in + (ptrdiff_t)(input_lwe_start_index); + auto d_lwe_ct_out = + d_lwe_ct_out_array + (ptrdiff_t)(output_lwe_start_index); + + // Execute keyswitch + cuda_keyswitch_lwe_ciphertext_vector_64( + streams[gpu_i], gpu_i, d_lwe_ct_out, lwe_output_indexes, + d_lwe_ct_in_slice, lwe_input_indexes, d_ksk, input_lwe_dimension, + output_lwe_dimension, ksk_base_log, ksk_level, num_inputs); + } + for (uint gpu_i = 0; gpu_i < active_gpu_count; gpu_i++) { + cuda_synchronize_stream(streams[gpu_i], gpu_i); + } + // Copy result back + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_ct_out_array, + number_of_inputs * (output_lwe_dimension + 1) * + sizeof(uint64_t), + streams[0], 0); + 
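// Every per-GPU stream was synchronized above once its keyswitch finished, so
// each GPU's output slice is already in d_lwe_ct_out_array when the copy-back
// is issued. That copy runs asynchronously on stream 0; the synchronization
// just below ensures lwe_out_ct is fully populated before the host decrypts.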
cuda_synchronize_stream(streams[0], 0); + + for (int i = 0; i < number_of_inputs; i++) { + uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs + + s * number_of_inputs + i]; + uint64_t decrypted = 0; + core_crypto_lwe_decrypt(&decrypted, + lwe_out_ct + i * (output_lwe_dimension + 1), + lwe_out_sk, output_lwe_dimension); + ASSERT_NE(decrypted, plaintext); + // The bit before the message + uint64_t rounding_bit = delta >> 1; + // Compute the rounding bit + uint64_t rounding = (decrypted & rounding_bit) << 1; + uint64_t decoded = (decrypted + rounding) / delta; + ASSERT_EQ(decoded, plaintext / delta) << "Index " << i << " is wrong"; + } + } + } +} + class KeyswitchTestPrimitives_u64 : public ::testing::TestWithParam { protected: @@ -161,3 +311,6 @@ std::string printParamName(::testing::TestParamInfo p) { INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64, ksk_params_u64, printParamName); +INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, + KeyswitchMultiGPUTestPrimitives_u64, ksk_params_u64, + printParamName); diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp index 95d07e58f..30cef8f01 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp @@ -137,7 +137,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, pbs_level, number_of_inputs, 1, 0, - cuda_get_max_shared_memory(gpu_index)); + cuda_get_max_shared_memory(gpu_index), 0); // Copy result to the host memory cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 021b795bd..1a003029e 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -228,6 +228,7 @@ extern "C" { num_lut_vectors: u32, lwe_idx: u32, max_shared_memory: u32, + gpu_offset: u32, ); /// This cleanup function frees the data for the low latency PBS on GPU @@ -315,6 +316,7 @@ extern "C" { num_lut_vectors: u32, lwe_idx: u32, max_shared_memory: u32, + gpu_offset: u32, lwe_chunk_size: u32, ); @@ -356,6 +358,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, + gpu_offset: u32, ); /// Perform the negation of a u64 input LWE ciphertext vector. 
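// The `gpu_offset: u32` parameters added to these bindings mirror the CUDA
// kernels: when a GPU works on a sub-batch, gpu_offset shifts its local
// indices into the global lwe_input_indexes / lwe_output_indexes tables.
// Single-GPU call sites pass 0.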
@@ -494,8 +497,9 @@ extern "C" { ); pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, message_modulus: u32, carry_modulus: u32, @@ -520,14 +524,19 @@ extern "C" { radix_lwe_out: *mut c_void, radix_lwe_left: *const c_void, radix_lwe_right: *const c_void, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, mem_ptr: *mut i8, polynomial_size: u32, num_blocks: u32, ); - pub fn cleanup_cuda_integer_mult(stream: *mut c_void, gpu_index: u32, mem_ptr: *mut *mut i8); + pub fn cleanup_cuda_integer_mult( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, @@ -542,8 +551,9 @@ extern "C" { ); pub fn scratch_cuda_integer_scalar_mul_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -568,8 +578,8 @@ extern "C" { decomposed_scalar: *const u64, has_at_least_one_set: *const u64, mem: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, lwe_dimension: u32, polynomial_size: u32, message_modulus: u32, @@ -578,14 +588,16 @@ extern "C" { ); pub fn cleanup_cuda_integer_radix_scalar_mul( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_bitop_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -612,8 +624,8 @@ extern "C" { radix_lwe_left: *const c_void, radix_lwe_right: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); @@ -624,8 +636,8 @@ extern "C" { radix_lwe_out: *mut c_void, radix_lwe_in: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); @@ -638,17 +650,23 @@ extern "C" { clear_blocks: *const c_void, num_clear_blocks: u32, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, op_type: u32, ); - pub fn cleanup_cuda_integer_bitop(stream: *mut c_void, gpu_index: u32, mem_ptr: *mut *mut i8); + pub fn cleanup_cuda_integer_bitop( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); pub fn scratch_cuda_integer_radix_comparison_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -676,14 +694,15 @@ extern "C" { radix_lwe_left: *const c_void, radix_lwe_right: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_comparison( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); @@ -695,8 +714,8 @@ extern "C" { radix_lwe_in: *const c_void, 
scalar_blocks: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, num_scalar_blocks: u32, ); @@ -723,8 +742,8 @@ extern "C" { gpu_count: u32, radix_lwe_right: *mut c_void, mem_ptr: *mut i8, - ksk: *const c_void, - bsk: *const c_void, + ksks: *const *mut c_void, + bsks: *const *mut c_void, lwe_dimension: u32, glwe_dimension: u32, polynomial_size: u32, @@ -743,8 +762,9 @@ extern "C" { ); pub fn scratch_cuda_apply_univariate_lut_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, input_lut: *const c_void, lwe_dimension: u32, @@ -769,20 +789,22 @@ extern "C" { output_radix_lwe: *mut c_void, input_radix_lwe: *const c_void, mem_ptr: *mut i8, - ksk: *const c_void, - bsk: *const c_void, + ksks: *const *mut c_void, + bsks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_apply_univariate_lut_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -808,14 +830,15 @@ extern "C" { radix_lwe: *mut c_void, shift: u32, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -841,26 +864,29 @@ extern "C" { radix_lwe: *mut c_void, shift: u32, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_radix_logical_scalar_shift( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -887,20 +913,22 @@ extern "C" { radix_lwe: *mut c_void, radix_shift: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_radix_shift_and_rotate( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_cmux_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -927,20 +955,22 @@ extern "C" { lwe_array_true: *const c_void, lwe_array_false: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const 
*mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_radix_cmux( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -966,20 +996,22 @@ extern "C" { radix_lwe: *mut c_void, n: u32, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_radix_scalar_rotate( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_propagate_single_carry_kb_64_inplace( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -1004,20 +1036,22 @@ extern "C" { radix_lwe: *mut c_void, carry_out: *mut c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_propagate_single_carry( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -1043,20 +1077,22 @@ extern "C" { radix_lwe_vec: *mut c_void, num_radix_in_vec: u32, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks_in_radix: u32, ); pub fn cleanup_cuda_integer_radix_sum_ciphertexts_vec( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -1083,20 +1119,22 @@ extern "C" { radix_lwe_left: *const c_void, radix_lwe_right: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_radix_overflowing_sub( - stream: *mut c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( - stream: *const c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, @@ -1123,14 +1161,15 @@ extern "C" { numerator: *const c_void, divisor: *const c_void, mem_ptr: *mut i8, - bsk: *const c_void, - ksk: *const c_void, + bsks: *const *mut c_void, + ksks: *const *mut c_void, num_blocks: u32, ); pub fn cleanup_cuda_integer_div_rem( - stream: *const c_void, - gpu_index: u32, + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, mem_ptr: *mut *mut i8, ); diff --git a/ci/slab.toml b/ci/slab.toml index 
62d3f842b..e47d7cc7b 100644 --- a/ci/slab.toml +++ b/ci/slab.toml @@ -31,6 +31,24 @@ environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" flavor_name = "n3-H100x1" +[backend.hyperstack.multi-h100-nvlink] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" +flavor_name = "n3-H100x8-NVLink-K8s" + +[backend.hyperstack.multi-a100-nvlink] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" +flavor_name = "n3-A100x8-NVLink" + +[backend.aws.multi-gpu-test] +region = "us-east-1" +image_id = "ami-0c0bf195ca4c175b6" +instance_type = "p3.8xlarge" +# One spawn attempt every 30 seconds for 1 hour +spawn_retry_attempts = 120 +spawn_retry_duration = 60 + [command.signed_integer_full_bench] workflow = "signed_integer_full_benchmark.yml" profile = "bench" diff --git a/tfhe/benches/core_crypto/ks_bench.rs b/tfhe/benches/core_crypto/ks_bench.rs index 0f03c50a8..df70961d1 100644 --- a/tfhe/benches/core_crypto/ks_bench.rs +++ b/tfhe/benches/core_crypto/ks_bench.rs @@ -256,11 +256,11 @@ mod cuda { CudaLweCiphertextList::from_lwe_ciphertext(&output_ct, &streams); let h_indexes = &[Scalar::ZERO]; - let mut d_input_indexes = unsafe { CudaVec::::new_async(1, &streams) }; - let mut d_output_indexes = unsafe { CudaVec::::new_async(1, &streams) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(1, &streams, 0) }; + let mut d_output_indexes = unsafe { CudaVec::::new_async(1, &streams, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &streams); - d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &streams); + d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &streams, 0); + d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &streams, 0); } streams.synchronize(); diff --git a/tfhe/benches/core_crypto/pbs_bench.rs b/tfhe/benches/core_crypto/pbs_bench.rs index f8a6027a3..8e95a6ee1 100644 --- a/tfhe/benches/core_crypto/pbs_bench.rs +++ b/tfhe/benches/core_crypto/pbs_bench.rs @@ -851,13 +851,13 @@ mod cuda { CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &stream); let h_indexes = &[Scalar::ZERO]; stream.synchronize(); - let mut d_input_indexes = unsafe { CudaVec::::new_async(1, &stream) }; - let mut d_output_indexes = unsafe { CudaVec::::new_async(1, &stream) }; - let mut d_lut_indexes = unsafe { CudaVec::::new_async(1, &stream) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(1, &stream, 0) }; + let mut d_output_indexes = unsafe { CudaVec::::new_async(1, &stream, 0) }; + let mut d_lut_indexes = unsafe { CudaVec::::new_async(1, &stream, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); - d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); - d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); + d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); + d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); + d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); } stream.synchronize(); @@ -978,13 +978,13 @@ mod cuda { CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &stream); let h_indexes = &[Scalar::ZERO]; stream.synchronize(); - let mut d_input_indexes = unsafe { CudaVec::::new_async(1, &stream) }; - let mut d_output_indexes = unsafe { CudaVec::::new_async(1, &stream) }; - let mut d_lut_indexes = unsafe { CudaVec::::new_async(1, &stream) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(1, &stream, 0) }; + let mut d_output_indexes = unsafe { 
CudaVec::::new_async(1, &stream, 0) }; + let mut d_lut_indexes = unsafe { CudaVec::::new_async(1, &stream, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); - d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); - d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); + d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); + d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); + d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); } stream.synchronize(); @@ -1111,20 +1111,20 @@ mod cuda { let mut out_pbs_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext_list(&output_lwe_list, &stream); let mut h_indexes: [Scalar; NUM_CTS] = [Scalar::ZERO; NUM_CTS]; - let mut d_lut_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream) }; + let mut d_lut_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream, 0) }; unsafe { - d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); + d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); } stream.synchronize(); for (i, index) in h_indexes.iter_mut().enumerate() { *index = Scalar::cast_from(i); } stream.synchronize(); - let mut d_input_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream) }; - let mut d_output_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream, 0) }; + let mut d_output_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); - d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); + d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); + d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); } stream.synchronize(); @@ -1256,20 +1256,20 @@ mod cuda { let mut out_pbs_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext_list(&output_lwe_list, &stream); let mut h_indexes: [Scalar; NUM_CTS] = [Scalar::ZERO; NUM_CTS]; - let mut d_lut_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream) }; + let mut d_lut_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream, 0) }; unsafe { - d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); + d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); } stream.synchronize(); for (i, index) in h_indexes.iter_mut().enumerate() { *index = Scalar::cast_from(i); } stream.synchronize(); - let mut d_input_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream) }; - let mut d_output_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream, 0) }; + let mut d_output_indexes = unsafe { CudaVec::::new_async(NUM_CTS, &stream, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); - d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream); + d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); + d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream, 0); } stream.synchronize(); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_keyswitch.rs index 73f71765d..bb79bd79f 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_keyswitch.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_keyswitch.rs @@ -89,10 +89,12 @@ fn lwe_encrypt_ks_decrypt_custom_mod>( .iter() .map(|&x| >::cast_into(x)) .collect_vec(); - let mut d_input_indexes = unsafe { 
CudaVec::::new_async(num_blocks, &stream) }; - let mut d_output_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream) }; - unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream) }; - unsafe { d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream) }; + let mut d_input_indexes = + unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + let mut d_output_indexes = + unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0) }; + unsafe { d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0) }; cuda_keyswitch_lwe_ciphertext( &d_ksk_big_to_small, diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping.rs index 2169225e6..91c0712ce 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping.rs @@ -130,8 +130,8 @@ fn lwe_encrypt_multi_bit_pbs_decrypt_custom_mod< } let mut d_test_vector_indexes = - unsafe { CudaVec::::new_async(number_of_messages, &stream) }; - unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream) }; + unsafe { CudaVec::::new_async(number_of_messages, &stream, 0) }; + unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) }; let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0; let lwe_indexes_usize: Vec = (0..num_blocks).collect_vec(); @@ -139,11 +139,13 @@ fn lwe_encrypt_multi_bit_pbs_decrypt_custom_mod< .iter() .map(|&x| >::cast_into(x)) .collect_vec(); - let mut d_output_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream) }; - let mut d_input_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream) }; + let mut d_output_indexes = + unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + let mut d_input_indexes = + unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream); - d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream); + d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); + d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); } cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs index 9b699b190..b1a1c8a50 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs @@ -127,8 +127,8 @@ fn lwe_encrypt_pbs_decrypt< } let mut d_test_vector_indexes = - unsafe { CudaVec::::new_async(number_of_messages, &stream) }; - unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream) }; + unsafe { CudaVec::::new_async(number_of_messages, &stream, 0) }; + unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) }; let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0; let lwe_indexes_usize: Vec = (0..num_blocks).collect_vec(); @@ -136,11 +136,13 @@ fn lwe_encrypt_pbs_decrypt< .iter() .map(|&x| >::cast_into(x)) .collect_vec(); - let mut d_output_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream) }; - let mut d_input_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream) }; + let mut d_output_indexes = + unsafe { 
CudaVec::::new_async(num_blocks, &stream, 0) }; + let mut d_input_indexes = + unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; unsafe { - d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream); - d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream); + d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); + d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); } cuda_programmable_bootstrap_lwe_ciphertext( diff --git a/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext_list.rs index c262a361c..bde72221b 100644 --- a/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext_list.rs @@ -23,6 +23,7 @@ impl CudaGlweCiphertextList { glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size) * glwe_ciphertext_count.0, streams, + 0, ); let cuda_glwe_list = CudaGlweList { d_vec, @@ -48,10 +49,11 @@ impl CudaGlweCiphertextList { glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size) * glwe_ciphertext_count.0, streams, + 0, ); // Copy to the GPU unsafe { - d_vec.copy_from_cpu_async(h_ct.as_ref(), streams); + d_vec.copy_from_cpu_async(h_ct.as_ref(), streams, 0); } streams.synchronize(); @@ -77,7 +79,7 @@ impl CudaGlweCiphertextList { unsafe { self.0 .d_vec - .copy_to_cpu_async(container.as_mut_slice(), streams); + .copy_to_cpu_async(container.as_mut_slice(), streams, 0); streams.synchronize(); } @@ -102,12 +104,13 @@ impl CudaGlweCiphertextList { glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size) * glwe_ciphertext_count.0, streams, + 0, ); // Copy to the GPU let h_input = h_ct.as_view().into_container(); unsafe { - d_vec.copy_from_cpu_async(h_input.as_ref(), streams); + d_vec.copy_from_cpu_async(h_input.as_ref(), streams, 0); } streams.synchronize(); diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_bootstrap_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_bootstrap_key.rs index 8730efc1b..0d2860c33 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_bootstrap_key.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_bootstrap_key.rs @@ -39,7 +39,7 @@ impl CudaLweBootstrapKey { let glwe_dimension = bsk.glwe_size().to_glwe_dimension(); // Allocate memory - let mut d_vec = CudaVec::::new( + let mut d_vec = CudaVec::::new_multi_gpu( lwe_bootstrap_key_size( input_lwe_dimension, glwe_dimension.to_glwe_size(), diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index fd1ffa04b..0f71d1749 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -23,6 +23,7 @@ impl CudaLweCiphertextList { CudaVec::new_async( lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, streams, + 0, ) }; streams.synchronize(); @@ -50,9 +51,10 @@ impl CudaLweCiphertextList { let mut d_vec = CudaVec::new( lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, streams, + 0, ); unsafe { - d_vec.copy_from_cpu_async(h_input.as_ref(), streams); + d_vec.copy_from_cpu_async(h_input.as_ref(), streams, 0); } streams.synchronize(); let cuda_lwe_list = CudaLweList { @@ -103,8 +105,9 @@ impl CudaLweCiphertextList { let mut d_vec = CudaVec::new( lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, streams, + 0, ); - let mut ptr = d_vec.as_mut_c_ptr(); + let mut ptr = d_vec.as_mut_c_ptr(0); let size = first_item.lwe_ciphertext_count().0 * lwe_dimension.to_lwe_size().0 * std::mem::size_of::(); @@ -112,7 
+115,7 @@ impl CudaLweCiphertextList { unsafe { cuda_memcpy_async_gpu_to_gpu( ptr, - first_item.0.d_vec.as_c_ptr(), + first_item.0.d_vec.as_c_ptr(0), size as u64, streams.ptr[0], streams.gpu_indexes[0], @@ -121,7 +124,7 @@ impl CudaLweCiphertextList { for list in cuda_ciphertexts_list_vec { cuda_memcpy_async_gpu_to_gpu( ptr, - list.0.d_vec.as_c_ptr(), + list.0.d_vec.as_c_ptr(0), size as u64, streams.ptr[0], streams.gpu_indexes[0], @@ -147,7 +150,7 @@ impl CudaLweCiphertextList { unsafe { self.0 .d_vec - .copy_to_cpu_async(container.as_mut_slice(), streams); + .copy_to_cpu_async(container.as_mut_slice(), streams, 0); } streams.synchronize(); @@ -167,9 +170,9 @@ impl CudaLweCiphertextList { let ciphertext_modulus = h_ct.ciphertext_modulus(); // Copy to the GPU - let mut d_vec = CudaVec::new(lwe_dimension.to_lwe_size().0, streams); + let mut d_vec = CudaVec::new(lwe_dimension.to_lwe_size().0, streams, 0); unsafe { - d_vec.copy_from_cpu_async(h_ct.as_ref(), streams); + d_vec.copy_from_cpu_async(h_ct.as_ref(), streams, 0); } streams.synchronize(); @@ -189,7 +192,7 @@ impl CudaLweCiphertextList { unsafe { self.0 .d_vec - .copy_to_cpu_async(container.as_mut_slice(), streams); + .copy_to_cpu_async(container.as_mut_slice(), streams, 0); } streams.synchronize(); @@ -226,9 +229,9 @@ impl CudaLweCiphertextList { let ciphertext_modulus = self.ciphertext_modulus(); // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams); + let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); unsafe { - d_vec.copy_from_gpu_async(&self.0.d_vec, streams); + d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); } streams.synchronize(); diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_keyswitch_key.rs index 50e198322..fd37d09a0 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_keyswitch_key.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_keyswitch_key.rs @@ -29,7 +29,7 @@ impl CudaLweKeyswitchKey { let ciphertext_modulus = h_ksk.ciphertext_modulus(); // Allocate memory - let mut d_vec = CudaVec::::new( + let mut d_vec = CudaVec::::new_multi_gpu( input_lwe_size.to_lwe_dimension().0 * lwe_keyswitch_key_input_key_element_encrypted_size( decomp_level_count, diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs index 04740d907..3d23c68ea 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs @@ -43,19 +43,17 @@ impl CudaLweMultiBitBootstrapKey { let grouping_factor = bsk.grouping_factor(); // Allocate memory - let mut d_vec = unsafe { - CudaVec::::new_async( - lwe_multi_bit_bootstrap_key_size( - input_lwe_dimension, - glwe_dimension.to_glwe_size(), - polynomial_size, - decomp_level_count, - grouping_factor, - ) - .unwrap(), - streams, + let mut d_vec = CudaVec::::new_multi_gpu( + lwe_multi_bit_bootstrap_key_size( + input_lwe_dimension, + glwe_dimension.to_glwe_size(), + polynomial_size, + decomp_level_count, + grouping_factor, ) - }; + .unwrap(), + streams, + ); // Copy to the GPU unsafe { convert_lwe_multi_bit_programmable_bootstrap_key_async( diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 1e0fde4ce..8eb516f2a 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -11,14 +11,20 @@ use crate::core_crypto::prelude::{ }; pub use algorithms::*; pub use entities::*; +use rayon::prelude::*; use 
std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; +#[derive(Debug)] pub struct CudaStreams { pub ptr: Vec<*mut c_void>, pub gpu_indexes: Vec, } +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for CudaStreams {} +unsafe impl Sync for CudaStreams {} + impl CudaStreams { /// Create a new `CudaStreams` structure with as many GPUs as there are on the machine, /// if they are connected via Nvlink. If the multiple GPUs on the machine are not connected @@ -53,6 +59,15 @@ impl CudaStreams { } } } + /// Synchronize one cuda streams in the `CudaStreams` structure + pub fn synchronize_one(&self, gpu_index: u32) { + unsafe { + cuda_synchronize_stream( + self.ptr[gpu_index as usize], + self.gpu_indexes[gpu_index as usize], + ); + } + } /// Return the number of GPU indexes, which is the same as the number of Cuda streams pub fn len(&self) -> usize { self.gpu_indexes.len() @@ -63,14 +78,15 @@ impl CudaStreams { } } -/// This structure allows to distinguish between a constant raw pointer that points the -/// CPU memory vs GPU Memory. -#[derive(Debug, Copy, Clone)] -pub struct CudaPtr(pub(crate) *const c_void); -/// This structure allows to distinguish between a mutable raw pointer that points the -/// CPU memory vs GPU Memory. -#[derive(Debug, Copy, Clone)] -pub struct CudaPtrMut(pub(crate) *mut c_void); +impl Drop for CudaStreams { + fn drop(&mut self) { + for (i, &s) in self.ptr.iter().enumerate() { + unsafe { + cuda_destroy_stream(s, self.gpu_indexes[i]); + } + } + } +} /// Discarding bootstrap on a vector of LWE ciphertexts /// @@ -111,13 +127,13 @@ pub unsafe fn programmable_bootstrap_async( cuda_programmable_bootstrap_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - lwe_out_indexes.as_c_ptr(), - test_vector.as_c_ptr(), - test_vector_indexes.as_c_ptr(), - lwe_array_in.as_c_ptr(), - lwe_in_indexes.as_c_ptr(), - bootstrapping_key.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_out_indexes.as_c_ptr(0), + test_vector.as_c_ptr(0), + test_vector_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + lwe_in_indexes.as_c_ptr(0), + bootstrapping_key.as_c_ptr(0), pbs_buffer, lwe_dimension.0 as u32, glwe_dimension.0 as u32, @@ -128,6 +144,7 @@ pub unsafe fn programmable_bootstrap_async( num_samples, lwe_idx.0 as u32, get_max_shared_memory(streams.gpu_indexes[0]) as u32, + 0, ); cleanup_cuda_programmable_bootstrap( streams.ptr[0], @@ -179,13 +196,13 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - output_indexes.as_c_ptr(), - test_vector.as_c_ptr(), - test_vector_indexes.as_c_ptr(), - lwe_array_in.as_c_ptr(), - input_indexes.as_c_ptr(), - bootstrapping_key.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + output_indexes.as_c_ptr(0), + test_vector.as_c_ptr(0), + test_vector_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + input_indexes.as_c_ptr(0), + bootstrapping_key.as_c_ptr(0), pbs_buffer, lwe_dimension.0 as u32, glwe_dimension.0 as u32, @@ -198,6 +215,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( lwe_idx.0 as u32, get_max_shared_memory(0) as u32, 0u32, + 0, ); cleanup_cuda_multi_bit_programmable_bootstrap( streams.ptr[0], @@ -229,16 +247,17 @@ pub unsafe fn keyswitch_async( cuda_keyswitch_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - lwe_out_indexes.as_c_ptr(), - lwe_array_in.as_c_ptr(), - lwe_in_indexes.as_c_ptr(), - 
keyswitch_key.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_out_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + lwe_in_indexes.as_c_ptr(0), + keyswitch_key.as_c_ptr(0), input_lwe_dimension.0 as u32, output_lwe_dimension.0 as u32, base_log.0 as u32, l_gadget.0 as u32, num_samples, + 0, ); } @@ -254,7 +273,7 @@ pub unsafe fn convert_lwe_keyswitch_key_async( dest: &mut CudaVec, src: &[T], ) { - dest.copy_from_cpu_async(src, streams); + dest.copy_from_cpu_multi_gpu_async(src, streams); } /// Convert programmable bootstrap key @@ -274,18 +293,19 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( polynomial_size: PolynomialSize, ) { let size = std::mem::size_of_val(src); - assert_eq!(dest.len() * std::mem::size_of::(), size); - - cuda_convert_lwe_programmable_bootstrap_key_64( - streams.ptr[0], - streams.gpu_indexes[0], - dest.as_mut_c_ptr(), - src.as_ptr().cast(), - input_lwe_dim.0 as u32, - glwe_dim.0 as u32, - l_gadget.0 as u32, - polynomial_size.0 as u32, - ); + streams.gpu_indexes.par_iter().for_each(|&gpu_index| { + assert_eq!(dest.len() * std::mem::size_of::(), size); + cuda_convert_lwe_programmable_bootstrap_key_64( + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + dest.get_mut_c_ptr(gpu_index), + src.as_ptr().cast(), + input_lwe_dim.0 as u32, + glwe_dim.0 as u32, + l_gadget.0 as u32, + polynomial_size.0 as u32, + ); + }); } /// Convert multi-bit programmable bootstrap key @@ -306,18 +326,20 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(), size); - cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( - streams.ptr[0], - streams.gpu_indexes[0], - dest.as_mut_c_ptr(), - src.as_ptr().cast(), - input_lwe_dim.0 as u32, - glwe_dim.0 as u32, - l_gadget.0 as u32, - polynomial_size.0 as u32, - grouping_factor.0 as u32, - ) + for &gpu_index in streams.gpu_indexes.iter() { + assert_eq!(dest.len() * std::mem::size_of::(), size); + cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + dest.as_mut_c_ptr(gpu_index), + src.as_ptr().cast(), + input_lwe_dim.0 as u32, + glwe_dim.0 as u32, + l_gadget.0 as u32, + polynomial_size.0 as u32, + grouping_factor.0 as u32, + ); + } } /// Discarding addition of a vector of LWE ciphertexts @@ -337,9 +359,9 @@ pub unsafe fn add_lwe_ciphertext_vector_async( cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - lwe_array_in_1.as_c_ptr(), - lwe_array_in_2.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in_1.as_c_ptr(0), + lwe_array_in_2.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, ); @@ -361,9 +383,9 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - lwe_array_out.as_c_ptr(), - lwe_array_in.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_out.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, ); @@ -386,9 +408,9 @@ pub unsafe fn add_lwe_ciphertext_vector_plaintext_vector_async( cuda_negate_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - lwe_array_in.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, ); @@ -456,8 +478,8 @@ pub unsafe fn negate_lwe_ciphertext_vector_assign_async( cuda_negate_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), 
- lwe_array_out.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_out.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, ); @@ -482,7 +504,7 @@ pub unsafe fn negate_integer_radix_assign_async( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - lwe_array.as_mut_c_ptr(), + lwe_array.as_mut_c_ptr(0), lwe_dimension.0 as u32, num_samples, message_modulus, @@ -506,9 +528,9 @@ pub unsafe fn mult_lwe_ciphertext_vector_cleartext_vector_assign_async( cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - lwe_array_out.as_mut_c_ptr(), - lwe_array_in.as_c_ptr(), - cleartext_array_in.as_c_ptr(), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + cleartext_array_in.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, ); @@ -607,10 +629,10 @@ mod tests { let vec = vec![1_u64, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; let stream = CudaStreams::new_single_gpu(0); unsafe { - let mut d_vec: CudaVec = CudaVec::::new_async(vec.len(), &stream); - d_vec.copy_from_cpu_async(&vec, &stream); + let mut d_vec: CudaVec = CudaVec::::new_async(vec.len(), &stream, 0); + d_vec.copy_from_cpu_async(&vec, &stream, 0); let mut empty = vec![0_u64; vec.len()]; - d_vec.copy_to_cpu_async(&mut empty, &stream); + d_vec.copy_to_cpu_async(&mut empty, &stream, 0); stream.synchronize(); assert_eq!(vec, empty); } diff --git a/tfhe/src/core_crypto/gpu/slice.rs b/tfhe/src/core_crypto/gpu/slice.rs index 25300133f..9076cf55c 100644 --- a/tfhe/src/core_crypto/gpu/slice.rs +++ b/tfhe/src/core_crypto/gpu/slice.rs @@ -1,24 +1,24 @@ use crate::core_crypto::gpu::vec::range_bounds_to_start_end; -use crate::core_crypto::gpu::{CudaPtr, CudaPtrMut, CudaStreams}; +use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::Numeric; use std::ffi::c_void; use std::marker::PhantomData; use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_async_to_cpu}; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone)] pub struct CudaSlice<'a, T: Numeric> { - ptr: CudaPtr, - _len: usize, - gpu_index: u32, + ptrs: Vec<*const c_void>, + _lengths: Vec, + gpu_indexes: Vec, _phantom_1: PhantomData, _phantom_2: PhantomData<&'a ()>, } #[derive(Debug)] pub struct CudaSliceMut<'a, T: Numeric> { - ptr: CudaPtrMut, - len: usize, - gpu_index: u32, + ptrs: Vec<*mut c_void>, + lengths: Vec, + gpu_indexes: Vec, _phantom_1: PhantomData, _phantom_2: PhantomData<&'a mut ()>, } @@ -33,9 +33,9 @@ where /// the cuda side. pub(crate) unsafe fn new(ptr: *const c_void, len: usize, gpu_index: u32) -> Self { Self { - ptr: CudaPtr(ptr), - _len: len, - gpu_index, + ptrs: vec![ptr; 1], + _lengths: vec![len; 1], + gpu_indexes: vec![gpu_index; 1], _phantom_1: PhantomData, _phantom_2: PhantomData, } @@ -45,11 +45,11 @@ where /// /// The caller must ensure that the slice outlives the pointer this function returns, /// or else it will end up pointing to garbage. - pub(crate) unsafe fn as_c_ptr(&self) -> *const c_void { - self.ptr.0 + pub(crate) unsafe fn as_c_ptr(&self, gpu_index: u32) -> *const c_void { + self.ptrs[gpu_index as usize] } - pub(crate) fn gpu_index(&self) -> u32 { - self.gpu_index + pub(crate) fn gpu_index(&self, index: u32) -> u32 { + self.gpu_indexes[index as usize] } } @@ -63,9 +63,9 @@ where /// the cuda side. 
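// Illustrative sketch, not from the patch: the slice types above switch from a
// single (ptr, len, gpu_index) triple to parallel vectors with one entry per GPU,
// and every accessor takes an explicit `gpu_index`. Note that, as in the patch,
// the `gpu_index` argument is used directly as a position into those vectors.
// Simplified stand-in types below.
use std::ffi::c_void;
use std::marker::PhantomData;

struct MultiGpuSliceMut<'a> {
    ptrs: Vec<*mut c_void>,
    lengths: Vec<usize>,
    gpu_indexes: Vec<u32>,
    _phantom: PhantomData<&'a mut ()>,
}

impl<'a> MultiGpuSliceMut<'a> {
    // Mirrors `as_mut_c_ptr(&mut self, gpu_index: u32)` after the patch.
    fn as_mut_ptr(&mut self, gpu_index: u32) -> *mut c_void {
        self.ptrs[gpu_index as usize]
    }
    // Mirrors `len(&self, gpu_index: u32)`.
    fn len(&self, gpu_index: u32) -> usize {
        self.lengths[gpu_index as usize]
    }
    // Mirrors `gpu_index(&self, index: u32)`: returns the GPU id stored at `index`.
    fn gpu_id(&self, index: u32) -> u32 {
        self.gpu_indexes[index as usize]
    }
}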
pub(crate) unsafe fn new(ptr: *mut c_void, len: usize, gpu_index: u32) -> Self { Self { - ptr: CudaPtrMut(ptr), - len, - gpu_index, + ptrs: vec![ptr; 1], + lengths: vec![len; 1], + gpu_indexes: vec![gpu_index; 1], _phantom_1: PhantomData, _phantom_2: PhantomData, } @@ -75,16 +75,16 @@ where /// /// The caller must ensure that the slice outlives the pointer this function returns, /// or else it will end up pointing to garbage. - pub(crate) unsafe fn as_mut_c_ptr(&mut self) -> *mut c_void { - self.ptr.0 + pub(crate) unsafe fn as_mut_c_ptr(&mut self, gpu_index: u32) -> *mut c_void { + self.ptrs[gpu_index as usize] } /// # Safety /// /// The caller must ensure that the slice outlives the pointer this function returns, /// or else it will end up pointing to garbage. - pub(crate) unsafe fn as_c_ptr(&self) -> *const c_void { - self.ptr.0.cast_const() + pub(crate) unsafe fn as_c_ptr(&self, gpu_index: u32) -> *const c_void { + self.ptrs[gpu_index as usize].cast_const() } /// Copies data between two `CudaSlice` @@ -93,22 +93,20 @@ where /// /// - [CudaStreams::synchronize] __must__ be called after the copy /// as soon as synchronization is required. - pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams) + pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams, gpu_index: u32) where T: Numeric, { - assert_eq!(self.len(), src.len()); - assert_eq!(self.gpu_index, streams.gpu_indexes[0]); - assert_eq!(src.gpu_index, streams.gpu_indexes[0]); - let size = src.len() * std::mem::size_of::(); + assert_eq!(self.len(gpu_index), src.len(gpu_index)); + let size = src.len(gpu_index) * std::mem::size_of::(); // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(), - src.as_c_ptr(), + self.as_mut_c_ptr(gpu_index), + src.as_c_ptr(gpu_index), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } } @@ -120,50 +118,48 @@ where /// /// - [CudaStreams::synchronize] __must__ be called after the copy /// as soon as synchronization is required. - pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams) + pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, gpu_index: u32) where T: Numeric, { - assert_eq!(self.len(), dest.len()); - assert_eq!(self.gpu_index, streams.gpu_indexes[0]); - let size = self.len() * std::mem::size_of::(); + assert_eq!(self.len(gpu_index), dest.len()); + let size = self.len(gpu_index) * std::mem::size_of::(); // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_to_cpu( dest.as_mut_ptr().cast::(), - self.as_c_ptr(), + self.as_c_ptr(gpu_index), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } } /// Returns the number of elements in the vector, also referred to as its ‘length’. 
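// Illustrative sketch, not from the patch: the copy helpers above now take a
// `gpu_index`, check lengths against that GPU's buffer, and skip zero-sized
// transfers so no invalid pointer reaches the CUDA memcpy. Host-side analogue
// with `Vec<u64>` standing in for device memory:
fn copy_to_cpu(per_gpu: &[Vec<u64>], dest: &mut [u64], gpu_index: u32) {
    let src = &per_gpu[gpu_index as usize];
    assert_eq!(src.len(), dest.len());
    let size = src.len() * std::mem::size_of::<u64>();
    // Same guard as before cuda_memcpy_async_to_cpu in the patch.
    if size > 0 {
        dest.copy_from_slice(src);
    }
}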
- pub fn len(&self) -> usize { - self.len + pub fn len(&self, gpu_index: u32) -> usize { + self.lengths[gpu_index as usize] } - /// Returns true if the CudaSliceMut is empty - pub fn is_empty(&self) -> bool { - self.len == 0 + /// Returns true if the ptr is empty + pub fn is_empty(&self, gpu_index: u32) -> bool { + self.lengths[gpu_index as usize] == 0 } - pub(crate) fn get_mut(&mut self, range: R) -> Option> + pub(crate) fn get_mut(&mut self, range: R, gpu_index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, { - let (start, end) = range_bounds_to_start_end(self.len(), range).into_inner(); + let (start, end) = range_bounds_to_start_end(self.len(gpu_index), range).into_inner(); // Check the range is compatible with the vec - if end <= start || end > self.len - 1 { + if end <= start || end > self.lengths[gpu_index as usize] - 1 { None } else { // Shift ptr let shifted_ptr: *mut c_void = unsafe { - self.ptr - .0 + self.ptrs[gpu_index as usize] .cast::() .add(start * std::mem::size_of::()) .cast() @@ -173,37 +169,51 @@ where let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, self.gpu_index) }) + Some(unsafe { + CudaSliceMut::new(shifted_ptr, new_len, self.gpu_indexes[gpu_index as usize]) + }) } } pub(crate) fn split_at_mut( &mut self, mid: usize, + gpu_index: u32, ) -> (Option>, Option>) where T: Numeric, { // Check the index is compatible with the vec - if mid > self.len - 1 { + if mid > self.lengths[gpu_index as usize] - 1 { (None, None) } else if mid == 0 { ( None, - Some(unsafe { CudaSliceMut::new(self.ptr.0, self.len, self.gpu_index) }), + Some(unsafe { + CudaSliceMut::new( + self.ptrs[gpu_index as usize], + self.lengths[gpu_index as usize], + gpu_index, + ) + }), ) - } else if mid == self.len - 1 { + } else if mid == self.lengths[gpu_index as usize] - 1 { ( - Some(unsafe { CudaSliceMut::new(self.ptr.0, self.len, self.gpu_index) }), + Some(unsafe { + CudaSliceMut::new( + self.ptrs[gpu_index as usize], + self.lengths[gpu_index as usize], + gpu_index, + ) + }), None, ) } else { let new_len_1 = mid; - let new_len_2 = self.len - mid; + let new_len_2 = self.lengths[gpu_index as usize] - mid; // Shift ptr let shifted_ptr: *mut c_void = unsafe { - self.ptr - .0 + self.ptrs[gpu_index as usize] .cast::() .add(mid * std::mem::size_of::()) .cast() @@ -211,12 +221,14 @@ where // Create the slice ( - Some(unsafe { CudaSliceMut::new(self.ptr.0, new_len_1, self.gpu_index) }), - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len_2, self.gpu_index) }), + Some(unsafe { + CudaSliceMut::new(self.ptrs[gpu_index as usize], new_len_1, gpu_index) + }), + Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len_2, gpu_index) }), ) } } - pub(crate) fn gpu_index(&self) -> u32 { - self.gpu_index + pub(crate) fn gpu_index(&self, index: u32) -> u32 { + self.gpu_indexes[index as usize] } } diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs index d6dd052a0..9a572c777 100644 --- a/tfhe/src/core_crypto/gpu/vec.rs +++ b/tfhe/src/core_crypto/gpu/vec.rs @@ -1,6 +1,7 @@ use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; -use crate::core_crypto::gpu::{synchronize_device, CudaPtrMut, CudaStreams}; +use crate::core_crypto::gpu::{synchronize_device, CudaStreams}; use crate::core_crypto::prelude::Numeric; +use rayon::prelude::*; use std::collections::Bound::{Excluded, Included, Unbounded}; use std::ffi::c_void; use std::marker::PhantomData; @@ -23,30 +24,76 @@ use tfhe_cuda_backend::cuda_bind::{ /// memory, it is pretty 
close to a `Vec`. That being said, it only present a very very limited api. #[derive(Debug)] pub struct CudaVec { - ptr: CudaPtrMut, - len: usize, - gpu_index: u32, + pub ptr: Vec<*mut c_void>, + pub len: usize, + pub gpu_indexes: Vec, _phantom: PhantomData, } impl CudaVec { - pub fn new(len: usize, streams: &CudaStreams) -> Self { - let vec = unsafe { Self::new_async(len, streams) }; + /// This creates a `CudaVec` that holds memory of `len` elements + /// on the GPU with index `gpu_index` + pub fn new(len: usize, streams: &CudaStreams, gpu_index: u32) -> Self { + let vec = unsafe { Self::new_async(len, streams, gpu_index) }; streams.synchronize(); vec } /// # Safety /// /// - `streams` __must__ be synchronized to guarantee computation has finished - pub unsafe fn new_async(len: usize, streams: &CudaStreams) -> Self { + pub unsafe fn new_async(len: usize, streams: &CudaStreams, gpu_index: u32) -> Self { let size = len as u64 * std::mem::size_of::() as u64; - let ptr = cuda_malloc_async(size, streams.ptr[0], streams.gpu_indexes[0]); - cuda_memset_async(ptr, 0u64, size, streams.ptr[0], streams.gpu_indexes[0]); + let ptr = cuda_malloc_async( + size, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + cuda_memset_async( + ptr, + 0u64, + size, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); Self { - ptr: CudaPtrMut(ptr), + ptr: vec![ptr; 1], len, - gpu_index: streams.gpu_indexes[0], + gpu_indexes: vec![streams.gpu_indexes[gpu_index as usize]; 1], + _phantom: PhantomData, + } + } + + /// This creates a `CudaVec` that holds memory of + /// `len` elements on as many GPUs as there are `CudaStreams` + pub fn new_multi_gpu(len: usize, streams: &CudaStreams) -> Self { + let size = len as u64 * std::mem::size_of::() as u64; + let mut ptrs = Vec::with_capacity(streams.len()); + for &gpu_index in streams.gpu_indexes.iter() { + let ptr = unsafe { + cuda_malloc_async( + size, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ) + }; + unsafe { + cuda_memset_async( + ptr, + 0u64, + size, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + } + streams.synchronize_one(gpu_index); + ptrs.push(ptr); + } + + Self { + ptr: ptrs, + len, + gpu_indexes: streams.gpu_indexes.clone(), _phantom: PhantomData, } } @@ -55,11 +102,11 @@ impl CudaVec { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronised - pub unsafe fn from_cpu_async(src: &[T], streams: &CudaStreams) -> Self { - let mut res = Self::new_async(src.len(), streams); + pub unsafe fn from_cpu_async(src: &[T], streams: &CudaStreams, gpu_index: u32) -> Self { + let mut res = Self::new(src.len(), streams, gpu_index); // We have to check that h_data is not empty, because cuda_memset with size 0 is invalid if !src.is_empty() { - res.copy_from_cpu_async(src, streams); + res.copy_from_cpu_async(src, streams, gpu_index); } res } @@ -68,7 +115,20 @@ impl CudaVec { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronised - pub unsafe fn memset_async(&mut self, value: T, streams: &CudaStreams) + pub unsafe fn from_cpu_multi_gpu_async(src: &[T], streams: &CudaStreams) -> Self { + let mut res = Self::new_multi_gpu(src.len(), streams); + // We have to check that h_data is not empty, because cuda_memset with size 0 is invalid + if !src.is_empty() { + 
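// Illustrative sketch, not from the patch: `new_multi_gpu` above allocates and
// zeroes one buffer of `len` elements per GPU, and `from_cpu_multi_gpu_async`
// then replicates the full host slice onto every one of them (the data is
// duplicated, not sharded). `Vec<u64>` is a hypothetical stand-in for a
// cuda_malloc_async/cuda_memset_async allocation; rayon mirrors the patch's
// par_iter over `gpu_indexes`.
use rayon::prelude::*;

fn from_cpu_multi_gpu(src: &[u64], gpu_indexes: &[u32]) -> Vec<Vec<u64>> {
    // One zero-initialised buffer per GPU.
    let mut per_gpu: Vec<Vec<u64>> =
        gpu_indexes.iter().map(|_| vec![0u64; src.len()]).collect();
    // Replicate the host data on every GPU, one copy per stream, in parallel.
    per_gpu.par_iter_mut().for_each(|dest| {
        // Empty sources are skipped in the real code to avoid invalid pointers.
        if !src.is_empty() {
            dest.copy_from_slice(src);
        }
    });
    per_gpu
}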
res.copy_from_cpu_multi_gpu_async(src, streams); + } + res + } + + /// # Safety + /// + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised + pub unsafe fn memset_async(&mut self, value: T, streams: &CudaStreams, gpu_index: u32) where T: Into, { @@ -76,22 +136,45 @@ impl CudaVec { // We check that self is not empty to avoid invalid pointers if size > 0 { cuda_memset_async( - self.as_mut_c_ptr(), + self.as_mut_c_ptr(gpu_index), value.into(), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } } + /// # Safety + /// + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised + pub unsafe fn memset_multi_gpu_async(&mut self, value: T, streams: &CudaStreams) + where + T: Into, + { + for &gpu_index in self.gpu_indexes.clone().iter() { + let size = self.len() * std::mem::size_of::(); + // We check that self is not empty to avoid invalid pointers + if size > 0 { + cuda_memset_async( + self.as_mut_c_ptr(gpu_index), + value.into(), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + } + } + } + /// Copies data from slice into `CudaVec` /// /// # Safety /// /// - [CudaStreams::synchronize] __must__ be called after the copy /// as soon as synchronization is required - pub unsafe fn copy_from_cpu_async(&mut self, src: &[T], streams: &CudaStreams) + pub unsafe fn copy_from_cpu_async(&mut self, src: &[T], streams: &CudaStreams, gpu_index: u32) where T: Numeric, { @@ -102,22 +185,50 @@ impl CudaVec { // invalid pointer being passed to copy_to_gpu_async if size > 0 { cuda_memcpy_async_to_gpu( - self.as_mut_c_ptr(), + self.as_mut_c_ptr(gpu_index), src.as_ptr().cast(), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } } + /// Copies data from slice into `CudaVec` + /// + /// # Safety + /// + /// - [CudaStreams::synchronize] __must__ be called after the copy + /// as soon as synchronization is required + pub unsafe fn copy_from_cpu_multi_gpu_async(&mut self, src: &[T], streams: &CudaStreams) + where + T: Numeric, + { + self.gpu_indexes.par_iter().for_each(|&gpu_index| { + assert!(self.len() >= src.len()); + let size = std::mem::size_of_val(src); + + // We have to check that src is not empty, because Rust slice with size 0 results in an + // invalid pointer being passed to copy_to_gpu_async + if size > 0 { + cuda_memcpy_async_to_gpu( + self.get_mut_c_ptr(gpu_index), + src.as_ptr().cast(), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + } + }); + } + /// Copies data between two `CudaVec` /// /// # Safety /// /// - [CudaStreams::synchronize] __must__ be called after the copy /// as soon as synchronization is required - pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams) + pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams, gpu_index: u32) where T: Numeric, { @@ -126,11 +237,11 @@ impl CudaVec { // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(), - src.as_c_ptr(), + self.as_mut_c_ptr(gpu_index), + src.as_c_ptr(gpu_index), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + 
streams.gpu_indexes[gpu_index as usize], ); } } @@ -146,6 +257,7 @@ impl CudaVec { range: R, src: &Self, streams: &CudaStreams, + gpu_index: u32, ) where R: std::ops::RangeBounds, T: Numeric, @@ -158,14 +270,16 @@ impl CudaVec { assert!(end < src.len()); assert!(end - start < self.len()); - let src_ptr = src.as_c_ptr().add(start * std::mem::size_of::()); + let src_ptr = src + .as_c_ptr(gpu_index) + .add(start * std::mem::size_of::()); let size = (end - start + 1) * std::mem::size_of::(); cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(), + self.as_mut_c_ptr(gpu_index), src_ptr, size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } @@ -180,6 +294,7 @@ impl CudaVec { range: R, src: &Self, streams: &CudaStreams, + gpu_index: u32, ) where R: std::ops::RangeBounds, T: Numeric, @@ -192,24 +307,26 @@ impl CudaVec { assert!(end < self.len()); assert!(end - start < src.len()); - let dest_ptr = self.as_mut_c_ptr().add(start * std::mem::size_of::()); + let dest_ptr = self + .as_mut_c_ptr(gpu_index) + .add(start * std::mem::size_of::()); let size = (end - start + 1) * std::mem::size_of::(); cuda_memcpy_async_gpu_to_gpu( dest_ptr, - src.as_c_ptr(), + src.as_c_ptr(gpu_index), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } - /// Copies data from `CudaVec` into slice + /// Copies data from `CudaVec` into slice on a specific GPU /// /// # Safety /// /// - [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required - pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams) + pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, gpu_index: u32) where T: Numeric, { @@ -221,23 +338,27 @@ impl CudaVec { if size > 0 { cuda_memcpy_async_to_cpu( dest.as_mut_ptr().cast(), - self.as_c_ptr(), + self.as_c_ptr(gpu_index), size as u64, - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); } } - pub(crate) fn as_mut_c_ptr(&mut self) -> *mut c_void { - self.ptr.0 + pub(crate) fn as_mut_c_ptr(&mut self, gpu_index: u32) -> *mut c_void { + self.ptr[gpu_index as usize] } - pub(crate) fn as_c_ptr(&self) -> *const c_void { - self.ptr.0.cast_const() + pub(crate) fn get_mut_c_ptr(&self, gpu_index: u32) -> *mut c_void { + self.ptr[gpu_index as usize] } - pub(crate) fn as_slice(&self, range: R) -> Option> + pub(crate) fn as_c_ptr(&self, gpu_index: u32) -> *const c_void { + self.ptr[gpu_index as usize].cast_const() + } + + pub(crate) fn as_slice(&self, range: R, gpu_index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, @@ -250,8 +371,7 @@ impl CudaVec { } else { // Shift ptr let shifted_ptr: *mut c_void = unsafe { - self.ptr - .0 + self.ptr[gpu_index as usize] .cast::() .add(start * std::mem::size_of::()) .cast() @@ -261,11 +381,11 @@ impl CudaVec { let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSlice::new(shifted_ptr, new_len, self.gpu_index) }) + Some(unsafe { CudaSlice::new(shifted_ptr, new_len, gpu_index) }) } } - pub(crate) fn as_mut_slice(&mut self, range: R) -> Option> + pub(crate) fn as_mut_slice(&mut self, range: R, gpu_index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, @@ -278,8 +398,7 @@ impl CudaVec { } else { // Shift ptr let shifted_ptr: *mut c_void = unsafe { - self.ptr - .0 + self.ptr[gpu_index as usize] .cast::() .add(start * 
std::mem::size_of::()) .cast() @@ -289,20 +408,22 @@ impl CudaVec { let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, self.gpu_index) }) + Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, gpu_index) }) } } - pub fn gpu_index(&self) -> u32 { - self.gpu_index + /// Returns the GPU index at index + pub fn gpu_index(&self, index: u32) -> u32 { + self.gpu_indexes[index as usize] } - /// Returns the number of elements in the vector, also referred to as its ‘length’. + /// Returns the number of elements in the vector, also referred to as its ‘length’, + /// on every GPU pub fn len(&self) -> usize { self.len } - /// Returns `true` if the CudaVec contains no elements. + /// Returns `true` if the CudaVec contains no elements on every GPU. pub fn is_empty(&self) -> bool { self.len == 0 } @@ -324,9 +445,11 @@ unsafe impl Sync for CudaVec where T: Sync + Numeric {} impl Drop for CudaVec { /// Free memory for pointer `ptr` synchronously fn drop(&mut self) { - // Synchronizes the device to be sure no stream is still using this pointer - synchronize_device(self.gpu_index); - unsafe { cuda_drop(self.as_mut_c_ptr(), self.gpu_index) }; + self.gpu_indexes.par_iter().for_each(|&gpu_index| { + // Synchronizes the device to be sure no stream is still using this pointer + synchronize_device(gpu_index); + unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) }; + }); } } diff --git a/tfhe/src/integer/gpu/ciphertext/boolean_value.rs b/tfhe/src/integer/gpu/ciphertext/boolean_value.rs index ecfac6c07..4d76b0a60 100644 --- a/tfhe/src/integer/gpu/ciphertext/boolean_value.rs +++ b/tfhe/src/integer/gpu/ciphertext/boolean_value.rs @@ -82,12 +82,11 @@ impl CudaBooleanBlock { pub fn copy_from_boolean_block(&mut self, boolean_block: &BooleanBlock, streams: &CudaStreams) { unsafe { - self.0 - .ciphertext - .d_blocks - .0 - .d_vec - .copy_from_cpu_async(boolean_block.0.ct.as_ref(), streams); + self.0.ciphertext.d_blocks.0.d_vec.copy_from_cpu_async( + boolean_block.0.ct.as_ref(), + streams, + 0, + ); } streams.synchronize(); @@ -154,8 +153,8 @@ impl CudaBooleanBlock { let lwe_ciphertext_count = self.0.ciphertext.d_blocks.lwe_ciphertext_count(); let ciphertext_modulus = self.0.ciphertext.d_blocks.ciphertext_modulus(); - let mut d_ct = CudaVec::new_async(self.0.ciphertext.d_blocks.0.d_vec.len(), streams); - d_ct.copy_from_gpu_async(&self.0.ciphertext.d_blocks.0.d_vec, streams); + let mut d_ct = CudaVec::new_async(self.0.ciphertext.d_blocks.0.d_vec.len(), streams, 0); + d_ct.copy_from_gpu_async(&self.0.ciphertext.d_blocks.0.d_vec, streams, 0); let d_blocks = CudaLweCiphertextList::from_cuda_vec(d_ct, lwe_ciphertext_count, ciphertext_modulus); diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 705e0ef59..7f0408edd 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -172,11 +172,11 @@ impl CudaUnsignedRadixCiphertext { .collect::>(); unsafe { - self.ciphertext - .d_blocks - .0 - .d_vec - .copy_from_cpu_async(h_radix_ciphertext.as_mut_slice(), streams); + self.ciphertext.d_blocks.0.d_vec.copy_from_cpu_async( + h_radix_ciphertext.as_mut_slice(), + streams, + 0, + ); } streams.synchronize(); @@ -325,11 +325,11 @@ impl CudaSignedRadixCiphertext { .collect::>(); unsafe { - self.ciphertext - .d_blocks - .0 - .d_vec - .copy_from_cpu_async(h_radix_ciphertext.as_mut_slice(), streams); + self.ciphertext.d_blocks.0.d_vec.copy_from_cpu_async( + h_radix_ciphertext.as_mut_slice(), + 
streams, + 0, + ); } streams.synchronize(); @@ -436,8 +436,8 @@ impl CudaRadixCiphertext { let lwe_ciphertext_count = self.d_blocks.lwe_ciphertext_count(); let ciphertext_modulus = self.d_blocks.ciphertext_modulus(); - let mut d_ct = CudaVec::new_async(self.d_blocks.0.d_vec.len(), streams); - d_ct.copy_from_gpu_async(&self.d_blocks.0.d_vec, streams); + let mut d_ct = CudaVec::new_async(self.d_blocks.0.d_vec.len(), streams, 0); + d_ct.copy_from_gpu_async(&self.d_blocks.0.d_vec, streams, 0); let d_blocks = CudaLweCiphertextList::from_cuda_vec(d_ct, lwe_ciphertext_count, ciphertext_modulus); @@ -458,12 +458,12 @@ impl CudaRadixCiphertext { self.d_blocks .0 .d_vec - .copy_to_cpu_async(self_container.as_mut_slice(), streams); + .copy_to_cpu_async(self_container.as_mut_slice(), streams, 0); other .d_blocks .0 .d_vec - .copy_to_cpu_async(other_container.as_mut_slice(), streams); + .copy_to_cpu_async(other_container.as_mut_slice(), streams, 0); } streams.synchronize(); diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 28a00fb55..5c28f8644 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -166,20 +166,20 @@ pub unsafe fn scalar_addition_integer_radix_assign_async( ) { assert_eq!( streams.gpu_indexes[0], - lwe_array.gpu_index(), + lwe_array.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - scalar_input.gpu_index(), + scalar_input.gpu_index(0), "GPU error: all data should reside on the same GPU." ); cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - lwe_array.as_mut_c_ptr(), - scalar_input.as_c_ptr(), + lwe_array.as_mut_c_ptr(0), + scalar_input.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, message_modulus, @@ -215,23 +215,24 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async(), has_at_least_one_set.as_ptr().cast::(), mem_ptr, - bootstrapping_key.as_c_ptr(), - keyswitch_key.as_c_ptr(), + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), (glwe_dimension.0 * polynomial_size.0) as u32, polynomial_size.0 as u32, message_modulus.0 as u32, @@ -266,8 +267,9 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_left.gpu_index(), + radix_lwe_left.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - radix_lwe_right.gpu_index(), + radix_lwe_right.gpu_index(0), "GPU error: all data should reside on the same GPU." ); cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0], - radix_lwe_left.as_mut_c_ptr(), - radix_lwe_left.as_c_ptr(), - radix_lwe_right.as_c_ptr(), + radix_lwe_left.as_mut_c_ptr(0), + radix_lwe_left.as_c_ptr(0), + radix_lwe_right.as_c_ptr(0), lwe_dimension.0 as u32, num_blocks, ); @@ -331,28 +333,29 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async( ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_input.gpu_index(), + radix_lwe_input.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - bootstrapping_key.gpu_index(), + bootstrapping_key.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - keyswitch_key.gpu_index(), + keyswitch_key.gpu_index(0), "GPU error: all data should reside on the same GPU." 
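// Illustrative sketch, not from the patch: the call sites above stop passing a
// single device pointer for the keys and instead hand the backend the whole
// per-GPU pointer table (`bootstrapping_key.ptr.as_ptr()`), together with the
// stream array, the GPU-index array and their count. `launch_on_gpus` is a
// hypothetical placeholder, not one of the real cuda_* FFI symbols.
use std::ffi::c_void;

fn launch_on_gpus(
    _streams: *const *mut c_void,
    _gpu_indexes: *const u32,
    _gpu_count: u32,
    _bsk_per_gpu: *const *mut c_void,
    _ksk_per_gpu: *const *mut c_void,
) {
    // The real entry points fan the work out over the listed GPUs.
}

fn call_backend(
    streams: &[*mut c_void],
    gpu_indexes: &[u32],
    bsk: &[*mut c_void],
    ksk: &[*mut c_void],
) {
    launch_on_gpus(
        streams.as_ptr(),
        gpu_indexes.as_ptr(),
        streams.len() as u32,
        bsk.as_ptr(),
        ksk.as_ptr(),
    );
}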
); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); @@ -896,10 +910,10 @@ pub unsafe fn full_propagate_assign_async( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - radix_lwe_input.as_mut_c_ptr(), + radix_lwe_input.as_mut_c_ptr(0), mem_ptr, - keyswitch_key.as_c_ptr(), - bootstrapping_key.as_c_ptr(), + keyswitch_key.ptr.as_ptr(), + bootstrapping_key.ptr.as_ptr(), lwe_dimension.0 as u32, glwe_dimension.0 as u32, polynomial_size.0 as u32, @@ -943,24 +957,25 @@ pub unsafe fn propagate_single_carry_assign_async( ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_input.gpu_index(), + radix_lwe_input.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - radix_lwe_output.gpu_index(), + radix_lwe_output.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - bootstrapping_key.gpu_index(), + bootstrapping_key.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - keyswitch_key.gpu_index(), + keyswitch_key.gpu_index(0), "GPU error: all data should reside on the same GPU." ); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); scratch_cuda_apply_univariate_lut_kb_64( - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, std::ptr::addr_of_mut!(mem_ptr), input_lut.as_ptr().cast(), lwe_dimension.0 as u32, @@ -2111,16 +2152,17 @@ pub unsafe fn apply_univariate_lut_kb_async( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - radix_lwe_output.as_mut_c_ptr(), - radix_lwe_input.as_c_ptr(), + radix_lwe_output.as_mut_c_ptr(0), + radix_lwe_input.as_c_ptr(0), mem_ptr, - keyswitch_key.as_c_ptr(), - bootstrapping_key.as_c_ptr(), + keyswitch_key.ptr.as_ptr(), + bootstrapping_key.ptr.as_ptr(), num_blocks, ); cleanup_cuda_apply_univariate_lut_kb_64( - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, std::ptr::addr_of_mut!(mem_ptr), ); } @@ -2157,8 +2199,9 @@ pub unsafe fn unchecked_unsigned_div_rem_integer_radix_kb_assign_async< ) { let mut mem_ptr: *mut i8 = std::ptr::null_mut(); scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, std::ptr::addr_of_mut!(mem_ptr), glwe_dimension.0 as u32, polynomial_size.0 as u32, @@ -2179,18 +2222,19 @@ pub unsafe fn unchecked_unsigned_div_rem_integer_radix_kb_assign_async< streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - quotient.as_mut_c_ptr(), - remainder.as_mut_c_ptr(), - numerator.as_c_ptr(), - divisor.as_c_ptr(), + quotient.as_mut_c_ptr(0), + remainder.as_mut_c_ptr(0), + numerator.as_c_ptr(0), + divisor.as_c_ptr(0), mem_ptr, - bootstrapping_key.as_c_ptr(), - keyswitch_key.as_c_ptr(), + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), num_blocks, ); cleanup_cuda_integer_div_rem( - streams.ptr[0], - streams.gpu_indexes[0], + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, std::ptr::addr_of_mut!(mem_ptr), ); } diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index a1245d142..01f88aa0a 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -232,12 +232,11 @@ impl CudaServerKey { return; } - result - .as_mut() - .d_blocks - .0 - 
.d_vec - .copy_from_gpu_async(&ciphertexts[0].as_ref().d_blocks.0.d_vec, streams); + result.as_mut().d_blocks.0.d_vec.copy_from_gpu_async( + &ciphertexts[0].as_ref().d_blocks.0.d_vec, + streams, + 0, + ); if ciphertexts.len() == 1 { return; } diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 6eb8341db..e078bd545 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -337,13 +337,14 @@ impl CudaServerKey { let shift = num_blocks * lwe_size.0; let mut extended_ct_vec = - unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream) }; + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; unsafe { - extended_ct_vec.memset_async(0u64, stream); + extended_ct_vec.memset_async(0u64, stream, 0); extended_ct_vec.copy_self_range_gpu_to_gpu_async( shift.., &ct.as_ref().d_blocks.0.d_vec, stream, + 0, ); } stream.synchronize(); @@ -406,10 +407,10 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let mut extended_ct_vec = - unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream) }; + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; unsafe { - extended_ct_vec.memset_async(0u64, stream); - extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream); + extended_ct_vec.memset_async(0u64, stream, 0); + extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0); } stream.synchronize(); let extended_ct_list = CudaLweCiphertextList::from_cuda_vec( @@ -471,12 +472,14 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut trimmed_ct_vec = unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream) }; + let mut trimmed_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( shift.., &ct.as_ref().d_blocks.0.d_vec, stream, + 0, ); } stream.synchronize(); @@ -536,12 +539,14 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = new_num_blocks * lwe_size.0; - let mut trimmed_ct_vec = unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream) }; + let mut trimmed_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( 0..shift, &ct.as_ref().d_blocks.0.d_vec, stream, + 0, ); } stream.synchronize(); @@ -609,21 +614,21 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.0.lwe_dimension.to_lwe_size().0; // Allocate the necessary amount of memory - let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, stream); + let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, stream, 0); unsafe { - output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream); + output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0); // Get the last ct block let last_block = ct .as_ref() .d_blocks .0 .d_vec - .as_slice(lwe_size * (num_ct_blocks - 1)..) 
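// Illustrative sketch, not from the patch: the block-count changes above all
// follow one pattern on GPU 0 — allocate a vector sized for the new block count,
// zero it, then range-copy the existing blocks at the right offset. Host-side
// analogue of the LSB extension, assuming `shift` is the number of added blocks
// times the LWE size, as in the surrounding code:
fn extend_lsb_with_trivial_zero_blocks(
    ct: &[u64],
    lwe_size: usize,
    added_blocks: usize,
) -> Vec<u64> {
    let shift = added_blocks * lwe_size;
    // New, larger buffer, zero-initialised like memset_async(0u64, ...).
    let mut extended = vec![0u64; ct.len() + shift];
    // Existing blocks land after the freshly added zero blocks.
    extended[shift..].copy_from_slice(ct);
    extended
}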
+ .as_slice(lwe_size * (num_ct_blocks - 1).., 0) .unwrap(); let mut output_slice = output_radix - .as_mut_slice(lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks) + .as_mut_slice(lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks, 0) .unwrap(); - let (padding_block, new_blocks) = output_slice.split_at_mut(lwe_size); + let (padding_block, new_blocks) = output_slice.split_at_mut(lwe_size, 0); let mut padding_block = padding_block.unwrap(); let mut new_blocks = new_blocks.unwrap(); @@ -679,9 +684,9 @@ impl CudaServerKey { } for i in 0..num_blocks - 1 { let mut output_block = new_blocks - .get_mut(lwe_size * i..lwe_size * (i + 1)) + .get_mut(lwe_size * i..lwe_size * (i + 1), 0) .unwrap(); - output_block.copy_from_gpu_async(&padding_block, stream); + output_block.copy_from_gpu_async(&padding_block, stream, 0); } } stream.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs index 1ff3e658f..cac166a53 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs @@ -73,14 +73,17 @@ impl CudaServerKey { { if scalar != Scalar::ZERO { let bits_in_message = self.message_modulus.0.ilog2(); - let mut d_decomposed_scalar = - CudaVec::::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, streams); + let mut d_decomposed_scalar = CudaVec::::new_async( + ct.as_ref().d_blocks.lwe_ciphertext_count().0, + streams, + 0, + ); let decomposed_scalar = BlockDecomposer::with_early_stop_at_zero(scalar, bits_in_message) .iter_as::() .take(d_decomposed_scalar.len()) .collect::>(); - d_decomposed_scalar.copy_from_cpu_async(decomposed_scalar.as_slice(), streams); + d_decomposed_scalar.copy_from_cpu_async(decomposed_scalar.as_slice(), streams, 0); let lwe_dimension = ct.as_ref().d_blocks.lwe_dimension(); // If the scalar is decomposed using less than the number of blocks our ciphertext diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs index 5e50415f5..4211f860d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs @@ -31,7 +31,7 @@ impl CudaServerKey { .map(|x| x as u64) .collect::>(); - let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, stream); + let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, stream, 0); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs index 7cdcb7fbb..e3cc0a5c7 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs @@ -153,7 +153,7 @@ impl CudaServerKey { // as we will handle them separately. 
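// Illustrative sketch, not from the patch: once the padding block has been
// built, the loop above copies it into each newly added block, one `lwe_size`
// chunk at a time, via get_mut + copy_from_gpu_async on GPU 0. Host-side
// analogue with `Vec<u64>` standing in for the device slice:
fn fill_with_padding_block(padding_block: &[u64], new_blocks: &mut [u64], lwe_size: usize) {
    assert_eq!(padding_block.len(), lwe_size);
    assert_eq!(new_blocks.len() % lwe_size, 0);
    for block in new_blocks.chunks_mut(lwe_size) {
        block.copy_from_slice(padding_block);
    }
}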
         scalar_blocks.truncate(ct.as_ref().d_blocks.lwe_ciphertext_count().0);
-        let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream);
+        let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream, 0);
         let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
@@ -327,7 +327,7 @@ impl CudaServerKey {
             .iter_as::<u64>()
             .collect::<Vec<_>>();
-        let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream);
+        let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream, 0);
         let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
index 850c6d049..9bab89c00 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
@@ -69,7 +69,7 @@ impl CudaServerKey {
         T: CudaIntegerRadixCiphertext,
     {
         if scalar == Scalar::ZERO {
-            ct.as_mut().d_blocks.0.d_vec.memset_async(0, stream);
+            ct.as_mut().d_blocks.0.d_vec.memset_async(0, stream, 0);
             return;
         }
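// Illustrative sketch, not from the patch: the scalar paths above first split the
// clear scalar into radix blocks on the CPU (BlockDecomposer with
// `bits_in_message = message_modulus.ilog2()` bits per block) and only then upload
// that Vec to GPU 0 with from_cpu_async / copy_from_cpu_async(..., 0). Simplified
// decomposition, without the early stop at zero:
fn decompose_scalar(mut scalar: u64, bits_in_message: u32, num_blocks: usize) -> Vec<u64> {
    assert!(bits_in_message < 64);
    let mask = (1u64 << bits_in_message) - 1;
    (0..num_blocks)
        .map(|_| {
            let block = scalar & mask;
            scalar >>= bits_in_message;
            block
        })
        .collect()
}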