mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-08 22:28:01 -05:00
feat(gpu): add tfhe-cuda-backend to the repository
97 .github/workflows/aws_tfhe_gpu_tests.yml vendored
@@ -1,14 +1,101 @@
# Compile and test Concrete-cuda on an AWS instance
name: Concrete Cuda - Full tests

env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"

on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
    # All the inputs are provided by Slab
    inputs:
      instance_id:
        description: "AWS instance ID"
        type: string
      instance_image_id:
        description: "AWS instance AMI ID"
        type: string
      instance_type:
        description: "AWS instance product type"
        type: string
      runner_name:
        description: "Action runner name"
        type: string
      request_id:
        description: "Slab request ID"
        type: string
      fork_repo:
        description: "Name of forked repo as user/repo"
        type: string
      fork_git_sha:
        description: "Git SHA to checkout from fork"
        type: string

jobs:
  placeholder:
    name: Placeholder
    runs-on: ubuntu-latest
  run-cuda-tests-linux:
    concurrency:
      group: tfhe_cuda_backend_test-${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    name: Test code in EC2
    runs-on: ${{ inputs.runner_name }}
    strategy:
      fail-fast: false
      # Explicit include-based build matrix of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

    steps:
      - run: |
          echo "Hello this is a Placeholder for GPU Workflow"
      # Step used for logging purposes.
      - name: Instance configuration used
        run: |
          echo "ID: ${{ inputs.instance_id }}"
          echo "AMI: ${{ inputs.instance_image_id }}"
          echo "Type: ${{ inputs.instance_type }}"
          echo "Request ID: ${{ inputs.request_id }}"
          echo "Fork repo: ${{ inputs.fork_repo }}"
          echo "Fork git sha: ${{ inputs.fork_git_sha }}"

      - name: Checkout tfhe-rs
        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9
        with:
          repository: ${{ inputs.fork_repo }}
          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
          default: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Run all tests
        run: |
          make clippy_gpu
          make test_gpu
157 .github/workflows/integer_gpu_benchmark.yml vendored Normal file
@@ -0,0 +1,157 @@
# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU benchmarks

on:
  workflow_dispatch:
    inputs:
      instance_id:
        description: "Instance ID"
        type: string
      instance_image_id:
        description: "Instance AMI ID"
        type: string
      instance_type:
        description: "Instance product type"
        type: string
      runner_name:
        description: "Action runner name"
        type: string
      request_id:
        description: "Slab request ID"
        type: string

env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

jobs:
  run-integer-benchmarks:
    name: Execute integer benchmarks in EC2
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    strategy:
      fail-fast: false
      # Explicit include-based build matrix of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Instance configuration used
        run: |
          echo "IDs: ${{ inputs.instance_id }}"
          echo "AMI: ${{ inputs.instance_image_id }}"
          echo "Type: ${{ inputs.instance_type }}"
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Get benchmark date
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
        with:
          fetch-depth: 0

      - name: Set up home
        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
          override: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Run benchmarks with AVX512
        run: |
          make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu

      - name: Parse benchmarks to csv
        run: |
          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
            parse_integer_benches

      - name: Upload csv results artifact
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}

      - name: Parse results
        run: |
          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
            --database tfhe_rs \
            --hardware ${{ inputs.instance_type }} \
            --backend gpu \
            --project-version "${COMMIT_HASH}" \
            --branch ${{ github.ref_name }} \
            --commit-date "${COMMIT_DATE}" \
            --bench-date "${{ env.BENCH_DATE }}" \
            --walk-subdirs \
            --name-suffix avx512 \
            --throughput

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
        with:
          repository: zama-ai/slab
          path: slab
          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
        run: |
          echo "Computing HMac on results file"
          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
          echo "Sending results to Slab..."
          curl -v -k \
            -H "Content-Type: application/json" \
            -H "X-Slab-Repository: ${{ github.repository }}" \
            -H "X-Slab-Command: store_data_v2" \
            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
            -d @${{ env.RESULTS_FILENAME }} \
            ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
154 .github/workflows/integer_gpu_full_benchmark.yml vendored Normal file
@@ -0,0 +1,154 @@
# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU full benchmarks

on:
  workflow_dispatch:
    inputs:
      instance_id:
        description: "Instance ID"
        type: string
      instance_image_id:
        description: "Instance AMI ID"
        type: string
      instance_type:
        description: "Instance product type"
        type: string
      runner_name:
        description: "Action runner name"
        type: string
      request_id:
        description: "Slab request ID"
        type: string

env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

jobs:
  integer-benchmarks:
    name: Execute integer benchmarks for all operations flavor
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    continue-on-error: true
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        command: [ integer, integer_multi_bit ]
        op_flavor: [ default, unchecked ]
        # Explicit include-based build matrix of known valid options
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Instance configuration used
        run: |
          echo "IDs: ${{ inputs.instance_id }}"
          echo "AMI: ${{ inputs.instance_image_id }}"
          echo "Type: ${{ inputs.instance_type }}"
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"

      - name: Set up home
        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
          override: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
        with:
          repository: zama-ai/slab
          path: slab
          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
            --database tfhe_rs \
            --hardware ${{ inputs.instance_type }} \
            --backend gpu \
            --project-version "${{ env.COMMIT_HASH }}" \
            --branch ${{ github.ref_name }} \
            --commit-date "${{ env.COMMIT_DATE }}" \
            --bench-date "${{ env.BENCH_DATE }}" \
            --walk-subdirs \
            --name-suffix avx512 \
            --throughput

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
        shell: bash
        run: |
          echo "Computing HMac on results file"
          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
          echo "Sending results to Slab..."
          curl -v -k \
            -H "Content-Type: application/json" \
            -H "X-Slab-Repository: ${{ github.repository }}" \
            -H "X-Slab-Command: store_data_v2" \
            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
            -d @${{ env.RESULTS_FILENAME }} \
            ${{ secrets.SLAB_URL }}

  slack-notification:
    name: Slack Notification
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ failure() }}
    needs: integer-benchmarks
    steps:
      - name: Notify
        continue-on-error: true
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
158 .github/workflows/integer_multi_bit_gpu_benchmark.yml vendored Normal file
@@ -0,0 +1,158 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer Multi-bit benchmarks

on:
  workflow_dispatch:
    inputs:
      instance_id:
        description: "Instance ID"
        type: string
      instance_image_id:
        description: "Instance AMI ID"
        type: string
      instance_type:
        description: "Instance product type"
        type: string
      runner_name:
        description: "Action runner name"
        type: string
      request_id:
        description: "Slab request ID"
        type: string

env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

jobs:
  run-integer-benchmarks:
    name: Execute integer multi-bit benchmarks in EC2
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    strategy:
      fail-fast: false
      # Explicit include-based build matrix of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
            cuda: "11.8"
            cuda_arch: "70"
            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Instance configuration used
        run: |
          echo "IDs: ${{ inputs.instance_id }}"
          echo "AMI: ${{ inputs.instance_image_id }}"
          echo "Type: ${{ inputs.instance_type }}"
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Get benchmark date
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
        with:
          fetch-depth: 0

      - name: Set up home
        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
          override: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Run multi-bit benchmarks with AVX512
        run: |
          make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu

      - name: Parse benchmarks to csv
        run: |
          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
            parse_integer_benches

      - name: Upload csv results artifact
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}

      - name: Parse results
        run: |
          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
            --database tfhe_rs \
            --hardware ${{ inputs.instance_type }} \
            --backend gpu \
            --project-version "${COMMIT_HASH}" \
            --branch ${{ github.ref_name }} \
            --commit-date "${COMMIT_DATE}" \
            --bench-date "${{ env.BENCH_DATE }}" \
            --walk-subdirs \
            --name-suffix avx512 \
            --throughput

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
        with:
          repository: zama-ai/slab
          path: slab
          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
        run: |
          echo "Computing HMac on results file"
          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
          echo "Sending results to Slab..."
          curl -v -k \
            -H "Content-Type: application/json" \
            -H "X-Slab-Repository: ${{ github.repository }}" \
            -H "X-Slab-Command: store_data_v2" \
            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
            -d @${{ env.RESULTS_FILENAME }} \
            ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
1 .github/workflows/start_benchmarks.yml vendored
@@ -49,6 +49,7 @@ jobs:
        command: [ boolean_bench, shortint_bench,
                   integer_bench, integer_multi_bit_bench,
                   signed_integer_bench, signed_integer_multi_bit_bench,
                   integer_gpu_bench, integer_multi_bit_gpu_bench,
                   pbs_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
4 .github/workflows/start_full_benchmarks.yml vendored
@@ -24,8 +24,8 @@ jobs:
    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
    strategy:
      matrix:
        command: [ boolean_bench, shortint_full_bench, integer_full_bench,
                   signed_integer_full_bench, pbs_bench, wasm_client_bench ]
        command: [ boolean_bench, shortint_full_bench, integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
                   pbs_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
@@ -29,6 +29,7 @@ jobs:
          allow-repeats: true
          message: |
            @slab-ci cpu_fast_test
            @slab-ci gpu_test

      - name: Add approved label
        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
68 Makefile
@@ -53,6 +53,10 @@ endif
REGEX_STRING?=''
REGEX_PATTERN?=''

# tfhe-cuda-backend
TFHECUDA_SRC="backends/tfhe-cuda-backend/implementation"
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

# Exclude these files from coverage reports
define COVERAGE_EXCLUDED_FILES
    --exclude-files apps/trivium/src/trivium/* \

@@ -137,10 +141,21 @@ check_linelint_installed:
fmt: install_rs_check_toolchain
    cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt

.PHONY: fmt_gpu # Format rust and cuda code
fmt_gpu: install_rs_check_toolchain
    cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
    cd backends/tfhe-cuda-backend/implementation/ && ./format_tfhe_cuda_backend.sh

.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
    cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check

.PHONY: clippy_gpu # Run clippy lints on the gpu backend
clippy_gpu: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
        --features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
        -p tfhe -- --no-deps -D warnings

.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
    linelint -a .

@@ -333,6 +348,23 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
        -p $(TFHE_SPEC) -- core_crypto::; \
    fi

.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu

.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
        --features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
    RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
        --features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::

.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
        --features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
    RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
        --features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::

.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \

@@ -498,7 +530,7 @@ docs: doc
lint_doc: install_rs_check_toolchain
    RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
        cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
        --features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps
        --features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --no-deps

.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
lint_docs: lint_doc

@@ -577,6 +609,14 @@ bench_integer: install_rs_check_toolchain
        --bench integer-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

.PHONY: bench_signed_integer # Run benchmarks for signed integer
bench_signed_integer: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
        --bench integer-signed-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
bench_integer_gpu: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
        --bench integer-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --

.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \

@@ -585,13 +631,6 @@ bench_integer_multi_bit: install_rs_check_toolchain
        --bench integer-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

.PHONY: bench_signed_integer # Run benchmarks for signed integer
bench_signed_integer: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
        --bench integer-signed-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
bench_signed_integer_multi_bit: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \

@@ -600,6 +639,14 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
        --bench integer-signed-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
bench_integer_multi_bit_gpu: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
    __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
        --bench integer-bench \
        --features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --

.PHONY: bench_shortint # Run benchmarks for shortint
bench_shortint: install_rs_check_toolchain
    RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \

@@ -715,9 +762,12 @@ sha256_bool: install_rs_check_toolchain
        --example sha256_bool \
        --features=$(TARGET_ARCH_FEATURE),boolean

.PHONY: pcc # pcc stands for pre commit checks
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests

.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: pcc clippy_gpu

.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
@@ -0,0 +1,10 @@
|
||||
# -----------------------------
|
||||
# Options effecting formatting.
|
||||
# -----------------------------
|
||||
with section("format"):
|
||||
|
||||
# How wide to allow formatted cmake files
|
||||
line_width = 120
|
||||
|
||||
# How many spaces to tab for indent
|
||||
tab_size = 2
|
||||
89 backends/tfhe-cuda-backend/implementation/CMakeLists.txt Normal file
@@ -0,0 +1,89 @@
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(tfhe_cuda_backend LANGUAGES CXX CUDA)

# Check that the minimum CUDA version is available; otherwise stop the build.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if(CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
endif()
# If CUDA is not available, or the minimum version is too low, do not build
if(NOT CMAKE_CUDA_COMPILER)
  message(FATAL_ERROR "Cuda compiler not found.")
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
  message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
endif()
# Get CUDA compute capability
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
execute_process(
  COMMAND ${OUTPUTFILE}
  RESULT_VARIABLE CUDA_RETURN_CODE
  OUTPUT_VARIABLE ARCH)
file(REMOVE ${OUTPUTFILE})

if(${CUDA_RETURN_CODE} EQUAL 0)
  set(CUDA_SUCCESS "TRUE")
else()
  set(CUDA_SUCCESS "FALSE")
endif()

if(${CUDA_SUCCESS})
  message(STATUS "CUDA Architecture: ${ARCH}")
  message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
  message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
  message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
  message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
else()
  message(WARNING ${ARCH})
endif()

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Add OpenMP support
find_package(OpenMP REQUIRED)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_ARCHITECTURES native)
if(NOT CUDA_NVCC_FLAGS)
  set(CUDA_NVCC_FLAGS -arch=sm_70)
endif()

# In production, use -arch=sm_70 --ptxas-options=-v to see register spills, and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
    -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
    --use_fast_math -Xcompiler -fPIC")

set(INCLUDE_DIR include)

add_subdirectory(src)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)
install(TARGETS tfhe_cuda_backend DESTINATION lib)

# Define a function to add a lint target.
find_file(CPPLINT NAMES cpplint cpplint.exe)
if(CPPLINT)
  # Add a custom target to lint all child projects. Dependencies are specified in child projects.
  add_custom_target(all_lint)
  # Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
  set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
  # set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()

enable_testing()
3 backends/tfhe-cuda-backend/implementation/CPPLINT.cfg Normal file
@@ -0,0 +1,3 @@
set noparent
linelength=240
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17
52 backends/tfhe-cuda-backend/implementation/README.md Normal file
@@ -0,0 +1,52 @@
# TFHE Cuda backend

## Introduction

The `tfhe-cuda-backend` holds the code for GPU acceleration of Zama's variant of TFHE.
It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.

It provides functions to allocate memory on the GPU, to copy data back
and forth between the CPU and the GPU, to create and destroy Cuda streams, etc. (a usage sketch follows the two lists below):
- `cuda_create_stream`, `cuda_destroy_stream`
- `cuda_malloc`, `cuda_check_valid_malloc`
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
- `cuda_get_number_of_gpus`
- `cuda_synchronize_device`

The cryptographic operations it provides are:
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
- a low-latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low_latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
- the larger-precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
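To make the calling convention concrete, here is a minimal host-side sketch (not part of the backend itself) that chains several of these entry points: create a stream, copy a batch of LWE ciphertexts and a keyswitching key to the GPU, keyswitch, and copy the result back. The signatures come from `device.h` and `keyswitch.h` in this commit, but every dimension and decomposition parameter below is an illustrative placeholder, and error handling is omitted.

```
// Hypothetical usage sketch; all parameter values are illustrative only.
#include "device.h"
#include "keyswitch.h"
#include <cstdint>
#include <numeric>
#include <vector>

void keyswitch_batch_example(uint64_t *h_in, uint64_t *h_out, uint64_t *h_ksk,
                             uint64_t ksk_bytes, uint32_t num_samples) {
  const uint32_t lwe_dim_in = 1024, lwe_dim_out = 512; // placeholder dimensions
  const uint32_t base_log = 3, level_count = 5;        // placeholder decomposition
  const uint64_t in_bytes = (uint64_t)num_samples * (lwe_dim_in + 1) * sizeof(uint64_t);
  const uint64_t out_bytes = (uint64_t)num_samples * (lwe_dim_out + 1) * sizeof(uint64_t);
  const uint64_t idx_bytes = (uint64_t)num_samples * sizeof(uint64_t);

  cuda_stream_t *stream = cuda_create_stream(0 /* gpu_index */);

  // Device allocations through the backend's allocator, then async uploads.
  void *d_in = cuda_malloc_async(in_bytes, stream);
  void *d_out = cuda_malloc_async(out_bytes, stream);
  void *d_ksk = cuda_malloc_async(ksk_bytes, stream);
  void *d_idx = cuda_malloc_async(idx_bytes, stream);
  cuda_memcpy_async_to_gpu(d_in, h_in, in_bytes, stream);
  cuda_memcpy_async_to_gpu(d_ksk, h_ksk, ksk_bytes, stream);

  // Identity index arrays: ciphertext i reads from slot i and writes to slot i.
  std::vector<uint64_t> idx(num_samples);
  std::iota(idx.begin(), idx.end(), 0);
  cuda_memcpy_async_to_gpu(d_idx, idx.data(), idx_bytes, stream);

  cuda_keyswitch_lwe_ciphertext_vector_64(stream, d_out, d_idx, d_in, d_idx,
                                          d_ksk, lwe_dim_in, lwe_dim_out,
                                          base_log, level_count, num_samples);

  // Copy the keyswitched ciphertexts back, then release everything.
  cuda_memcpy_async_to_cpu(h_out, d_out, out_bytes, stream);
  cuda_synchronize_stream(stream);
  cuda_drop_async(d_in, stream);
  cuda_drop_async(d_out, stream);
  cuda_drop_async(d_ksk, stream);
  cuda_drop_async(d_idx, stream);
  cuda_destroy_stream(stream);
}
```

Because all the `_async` entry points are ordered by the same stream, a single `cuda_synchronize_stream` after the download is enough before reading `h_out` on the host.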

## Dependencies

**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.

- nvidia driver - for example, if you're running Ubuntu 20.04, check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation instructions
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about compatible nvcc/gcc versions
- [cmake](https://cmake.org/) >= 3.24

## Build

The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the
following way:
```
git clone git@github.com:zama-ai/tfhe-rs
cd tfhe-rs/backends/tfhe-cuda-backend/implementation
mkdir build
cd build
cmake ..
make
```
The compute capability is detected automatically (from the first GPU's information) and set accordingly.

## Links

- [TFHE](https://eprint.iacr.org/2018/421.pdf)

## License

This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.ai`.
22 backends/tfhe-cuda-backend/implementation/check_cuda.cu Normal file
@@ -0,0 +1,22 @@
#include <stdio.h>

int main(int argc, char **argv) {
  cudaDeviceProp dP;
  float min_cc = 3.0;

  int rc = cudaGetDeviceProperties(&dP, 0);
  if (rc != cudaSuccess) {
    cudaError_t error = cudaGetLastError();
    printf("CUDA error: %s", cudaGetErrorString(error));
    return rc; /* Failure */
  }
  // Use floating-point division so the fractional part of the compute
  // capability is not truncated away by integer division.
  if ((dP.major + (dP.minor / 10.0)) < min_cc) {
    printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
           "Building CUDA Code",
           min_cc, dP.major, dP.minor);
    return 1; /* Failure */
  } else {
    printf("-arch=sm_%d%d", dP.major, dP.minor);
    return 0; /* Success */
  }
}
6 backends/tfhe-cuda-backend/implementation/format_tfhe_cuda_backend.sh Executable file
@@ -0,0 +1,6 @@
#!/bin/bash

find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py

find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
118 backends/tfhe-cuda-backend/implementation/include/bootstrap.h Normal file
@@ -0,0 +1,118 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H

#include "device.h"
#include <cstdint>

enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };

extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
                                 cuda_stream_t *stream,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials);

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
                                       cuda_stream_t *stream,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size);

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
                                       cuda_stream_t *stream,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size);

void scratch_cuda_bootstrap_amortized_32(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory);

void scratch_cuda_bootstrap_amortized_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
                                      int8_t **pbs_buffer);

void scratch_cuda_bootstrap_low_latency_32(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

void scratch_cuda_bootstrap_low_latency_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
                                        int8_t **pbs_buffer);

uint64_t get_buffer_size_bootstrap_amortized_64(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);

uint64_t get_buffer_size_bootstrap_low_latency_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t level_count);

template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t level_count);

template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t level_count);

template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);

#endif

#endif // CUDA_BOOTSTRAP_H
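Each PBS flavor above follows the same scratch/run/cleanup lifecycle. A minimal call-order sketch for the 64-bit amortized bootstrap, assuming device buffers for ciphertexts, LUTs, indexes, and the bootstrapping key are already populated; every dimension below is a placeholder, not a recommended parameter set:

```
// Hypothetical call-order sketch for the amortized PBS declared above.
#include "bootstrap.h"
#include <cstdint>

void amortized_pbs_example(cuda_stream_t *stream, void *d_lwe_out,
                           void *d_out_idx, void *d_lut, void *d_lut_idx,
                           void *d_lwe_in, void *d_in_idx, void *d_bsk,
                           uint32_t num_samples) {
  const uint32_t lwe_dimension = 742, glwe_dimension = 1,
                 polynomial_size = 2048, base_log = 23,
                 level_count = 1; // placeholders
  const uint32_t max_shared_memory =
      (uint32_t)cuda_get_max_shared_memory(stream->gpu_index);

  // 1) Allocate the temporary buffer the kernels need.
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_amortized_64(stream, &pbs_buffer, glwe_dimension,
                                      polynomial_size, num_samples,
                                      max_shared_memory, true);

  // 2) Run the bootstrap over the whole batch.
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_out_idx, d_lut, d_lut_idx, d_lwe_in, d_in_idx,
      d_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      base_log, level_count, num_samples, 1 /* num_lut_vectors */,
      0 /* lwe_idx */, max_shared_memory);

  // 3) Release the scratch space once the batch is done.
  cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
  cuda_synchronize_stream(stream);
}
```

The low-latency variant has the same shape; only the scratch function takes `level_count` as an extra sizing input.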
@@ -0,0 +1,45 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H

#include "device.h" // for cuda_stream_t, used by the declarations below
#include <cstdint>

extern "C" {
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
    uint32_t grouping_factor);

void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t chunk_size = 0);

void scratch_cuda_multi_bit_pbs_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory,
    uint32_t chunk_size = 0);

void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
}
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
                                     uint32_t level_count,
                                     uint32_t glwe_dimension,
                                     uint32_t num_samples);

__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
                                             uint32_t level_count,
                                             uint32_t glwe_dimension);

__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
#endif

#endif // CUDA_MULTI_BIT_H
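The multi-bit PBS follows the same scratch, run, cleanup pattern as the classical bootstraps, with two extra knobs: the `grouping_factor` of the multi-bit bootstrapping key and an optional `chunk_size` (leaving it at 0 lets the backend pick one internally). A hedged sketch, with placeholder dimensions; the include path matches the `bootstrap_multibit.h` entry in `src/CMakeLists.txt` below:

```
// Hypothetical sketch of the multi-bit PBS lifecycle; dimensions are placeholders.
#include "bootstrap_multibit.h"
#include <cstdint>

void multi_bit_pbs_example(cuda_stream_t *stream, void *d_out, void *d_out_idx,
                           void *d_lut, void *d_lut_idx, void *d_in,
                           void *d_in_idx, void *d_bsk, uint32_t num_samples) {
  const uint32_t lwe_dimension = 742, glwe_dimension = 1,
                 polynomial_size = 2048, grouping_factor = 2, base_log = 15,
                 level_count = 2; // placeholders
  const uint32_t max_shared_memory =
      (uint32_t)cuda_get_max_shared_memory(stream->gpu_index);

  int8_t *pbs_buffer = nullptr;
  // chunk_size is omitted, so the default of 0 applies and the backend chooses.
  scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
                                glwe_dimension, polynomial_size, level_count,
                                grouping_factor, num_samples,
                                max_shared_memory, true);
  cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
      stream, d_out, d_out_idx, d_lut, d_lut_idx, d_in, d_in_idx, d_bsk,
      pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      grouping_factor, base_log, level_count, num_samples,
      1 /* num_lut_vectors */, 0 /* lwe_idx */, max_shared_memory);
  cleanup_cuda_multi_bit_pbs(stream, &pbs_buffer);
}
```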
@@ -0,0 +1,18 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H

#include "device.h" // for cuda_stream_t
#include <cstdint>

extern "C" {
// Declarations aligned with the definitions in ciphertext.cuh below.
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
}
#endif
88 backends/tfhe-cuda-backend/implementation/include/device.h Normal file
@@ -0,0 +1,88 @@
#ifndef DEVICE_H
#define DEVICE_H

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

#define synchronize_threads_in_block() __syncthreads()

extern "C" {

struct cuda_stream_t {
  cudaStream_t stream;
  uint32_t gpu_index;

  cuda_stream_t(uint32_t gpu_index) {
    this->gpu_index = gpu_index;

    cudaStreamCreate(&stream);
  }

  void release() {
    cudaSetDevice(gpu_index);
    cudaStreamDestroy(stream);
  }

  void synchronize() { cudaStreamSynchronize(stream); }
};

cuda_stream_t *cuda_create_stream(uint32_t gpu_index);

int cuda_destroy_stream(cuda_stream_t *stream);

void *cuda_malloc(uint64_t size, uint32_t gpu_index);

void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);

int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

int cuda_check_support_cooperative_groups();

int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);

int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                             cuda_stream_t *stream);

int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                 cuda_stream_t *stream);

int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);

int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                             cuda_stream_t *stream);

int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                      cuda_stream_t *stream);

int cuda_get_number_of_gpus();

int cuda_synchronize_device(uint32_t gpu_index);

int cuda_drop(void *ptr, uint32_t gpu_index);

int cuda_drop_async(void *ptr, cuda_stream_t *stream);

int cuda_get_max_shared_memory(uint32_t gpu_index);

int cuda_synchronize_stream(cuda_stream_t *stream);

#define check_cuda_error(ans)                                                  \
  { cuda_error((ans), __FILE__, __LINE__); }
inline void cuda_error(cudaError_t code, const char *file, int line,
                       bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort)
      exit(code);
  }
}
}

template <typename Torus>
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
                          Torus n);
#endif
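The `check_cuda_error` macro above gives every raw CUDA runtime call a uniform failure path: print file and line, then abort. A small sketch of how a caller might wrap plain runtime calls with it (the commented-out kernel launch is a hypothetical placeholder, not backend code):

```
// Hypothetical usage of check_cuda_error from device.h.
#include "device.h"
#include <cstdint>

void copy_and_launch(void *d_dst, void *h_src, uint64_t size,
                     cuda_stream_t *stream) {
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(cudaMemcpyAsync(d_dst, h_src, size,
                                   cudaMemcpyHostToDevice, stream->stream));
  // my_kernel<<<grid, block, 0, stream->stream>>>(...); // placeholder launch
  check_cuda_error(cudaGetLastError()); // surface any launch failure
  check_cuda_error(cudaStreamSynchronize(stream->stream));
}
```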
@@ -0,0 +1,100 @@
#include "cuComplex.h"
#include "thrust/complex.h"
#include <iostream>
#include <string>
#include <type_traits>

#define PRINT_VARS
#ifdef PRINT_VARS
#define PRINT_DEBUG_5(var, begin, end, step, cond)                             \
  _print_debug(var, #var, begin, end, step, cond, "", false)
#define PRINT_DEBUG_6(var, begin, end, step, cond, text)                       \
  _print_debug(var, #var, begin, end, step, cond, text, true)
#define CAT(A, B) A##B
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
#define PRINT_DEBUG(...)                                                       \
  PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif

template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %u\n", var_name, i, var[i]);
    }
  }
  __syncthreads();
}

template <typename T>
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %d\n", var_name, i, var[i]);
    }
  }
  __syncthreads();
}

template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %.15f\n", var_name, i, var[i]);
    }
  }
  __syncthreads();
}

template <typename T>
__device__
    typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
                            void>::type
    _print_debug(T *var, const char *var_name, int start, int end, int step,
                 bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
             var[i].imag());
    }
  }
  __syncthreads();
}

template <typename T>
__device__
    typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
    _print_debug(T *var, const char *var_name, int start, int end, int step,
                 bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
    }
  }
  __syncthreads();
}
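The macro counting above (`VA_SIZE` feeding `PRINT_SELECT`) dispatches `PRINT_DEBUG` to the 5- or 6-argument form, and the `enable_if` overloads pick a `printf` format per element type. A hedged sketch of how a kernel might use it; the kernel and the include path are hypothetical (the header's file name is not visible in this diff):

```
// Hypothetical device-side usage of PRINT_DEBUG; kernel is a placeholder.
#include "helper_debug.cuh" // assumed name for the debug header above
#include <cstdint>

__global__ void inspect_buffer(uint64_t *data, int n) {
  // Every thread calls the helper (it contains __syncthreads()), but only
  // one thread actually prints, so the dump appears once.
  bool cond = (blockIdx.x == 0 && threadIdx.x == 0);
  PRINT_DEBUG(data, 0, n, 2, cond);               // 5-argument form
  PRINT_DEBUG(data, 0, n, 1, cond, "full dump:"); // 6-argument form, labeled
}
```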
1303 backends/tfhe-cuda-backend/implementation/include/integer.h Normal file
File diff suppressed because it is too large
@@ -0,0 +1,21 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_

#include "device.h" // for cuda_stream_t
#include <cstdint>

extern "C" {

void cuda_keyswitch_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);

void cuda_keyswitch_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);
}

#endif // CNCRT_KS_H_
@@ -0,0 +1,50 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include "bootstrap.h"
#include <cstdint>
#include <device.h>

extern "C" {

void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_LINALG_H_
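As a usage illustration, a hedged sketch of one of these entry points (not part of the diff; the d_* names are hypothetical device arrays, each holding input_lwe_ciphertext_count ciphertexts of input_lwe_dimension + 1 64-bit words):

// Hypothetical call: out = in_1 + in_2, computed independently on each
// ciphertext of the batch, asynchronously on the given stream.
cuda_add_lwe_ciphertext_vector_64(stream, d_lwe_out, d_lwe_in_1, d_lwe_in_2,
                                  input_lwe_dimension,
                                  input_lwe_ciphertext_count);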
22
backends/tfhe-cuda-backend/implementation/src/CMakeLists.txt
Normal file
@@ -0,0 +1,22 @@
set(SOURCES
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
# Note: this glob overwrites the header list set above, so only the .cu files
# actually end up in SOURCES.
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(
  tfhe_cuda_backend
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
             CUDA_RESOLVE_DEVICE_SYMBOLS ON
             CUDA_ARCHITECTURES native)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)
@@ -0,0 +1 @@
#include "ciphertext.cuh"
@@ -0,0 +1,44 @@
#ifndef CUDA_CIPHERTEXT_CUH
#define CUDA_CIPHERTEXT_CUH

#include "ciphertext.h"
#include "device.h"
#include <cstdint>

template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
                                               cuda_stream_t *stream,
                                               uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
  cudaSetDevice(stream->gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_gpu(dest, src, size, stream);
}

void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}

template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
                                               cuda_stream_t *stream,
                                               uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
  cudaSetDevice(stream->gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_cpu(dest, src, size, stream);
}

void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}

#endif // CUDA_CIPHERTEXT_CUH
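A hypothetical round trip through these helpers (not part of the diff; h_lwe, h_out, d_lwe, num_cts and lwe_dim are assumptions):

// Push a batch of 64-bit LWE ciphertexts to the GPU, operate on it, and
// read the result back; both copies are asynchronous on the same stream.
cuda_convert_lwe_ciphertext_vector_to_gpu_64(d_lwe, h_lwe, stream, num_cts,
                                             lwe_dim);
// ... launch kernels operating on d_lwe ...
cuda_convert_lwe_ciphertext_vector_to_cpu_64(h_out, d_lwe, stream, num_cts,
                                             lwe_dim);
cuda_synchronize_stream(stream);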
162
backends/tfhe-cuda-backend/implementation/src/crypto/gadget.cuh
Normal file
@@ -0,0 +1,162 @@
#ifndef CNCRT_CRYPTO_CUH
#define CNCRT_CRYPTO_CUH

#include "device.h"
#include <cstdint>

/**
 * GadgetMatrix implements the iterator design pattern to decompose a set of
 * num_poly consecutive polynomials with degree params::degree. A total of
 * level_count levels is expected and each call to
 * decompose_and_compress_next() writes the next level to the result. It is
 * also possible to advance an arbitrary number of levels by using
 * decompose_and_compress_level().
 *
 * This class always decomposes the entire set of num_poly polynomials.
 * By default, it works on a single polynomial.
 */
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  uint32_t num_poly;
  T offset;
  int current_level;
  T mask_mod_b;
  T *state;

public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
                          uint32_t num_poly = 1)
      : base_log(base_log), level_count(level_count), num_poly(num_poly),
        state(state) {

    mask_mod_b = (1ll << base_log) - 1ll;
    current_level = level_count;
    int tid = threadIdx.x;
    for (int i = 0; i < num_poly * params::opt; i++) {
      state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // Decomposes all polynomials at once
  __device__ void decompose_and_compress_next(double2 *result) {
    for (int j = 0; j < num_poly; j++) {
      auto result_slice = result + j * params::degree / 2;
      decompose_and_compress_next_polynomial(result_slice, j);
    }
  }

  // Decomposes a single polynomial
  __device__ void decompose_and_compress_next_polynomial(double2 *result,
                                                         int j) {
    if (j == 0)
      current_level -= 1;

    int tid = threadIdx.x;
    auto state_slice = state + j * params::degree;
    for (int i = 0; i < params::opt / 2; i++) {
      T res_re = state_slice[tid] & mask_mod_b;
      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
      state_slice[tid] >>= base_log;
      state_slice[tid + params::degree / 2] >>= base_log;
      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
      T carry_im =
          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
      carry_re >>= (base_log - 1);
      carry_im >>= (base_log - 1);
      state_slice[tid] += carry_re;
      state_slice[tid + params::degree / 2] += carry_im;
      res_re -= carry_re << base_log;
      res_im -= carry_im << base_log;

      result[tid].x = (int32_t)res_re;
      result[tid].y = (int32_t)res_im;

      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // Decomposes a single polynomial, storing this thread's coefficients
  // contiguously (result[i]) instead of strided (result[tid])
  __device__ void
  decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
    if (j == 0)
      current_level -= 1;

    int tid = threadIdx.x;
    auto state_slice = state + j * params::degree;
    for (int i = 0; i < params::opt / 2; i++) {
      T res_re = state_slice[tid] & mask_mod_b;
      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
      state_slice[tid] >>= base_log;
      state_slice[tid + params::degree / 2] >>= base_log;
      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
      T carry_im =
          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
      carry_re >>= (base_log - 1);
      carry_im >>= (base_log - 1);
      state_slice[tid] += carry_re;
      state_slice[tid + params::degree / 2] += carry_im;
      res_re -= carry_re << base_log;
      res_im -= carry_im << base_log;

      result[i].x = (int32_t)res_re;
      result[i].y = (int32_t)res_im;

      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  __device__ void decompose_and_compress_level(double2 *result, int level) {
    for (int i = 0; i < level_count - level; i++)
      decompose_and_compress_next(result);
  }
};

template <typename T> class GadgetMatrixSingle {
private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
      : base_log(base_log), level_count(level_count) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->level_count; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
  Torus res = state & mask_mod_b;
  state >>= base_log;
  Torus carry = ((res - 1ll) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res;
}

#endif // CNCRT_CRYPTO_CUH
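To make the balanced decomposition in decompose_one concrete, here is a minimal host-side replica of the same arithmetic (not part of the diff; a self-contained sketch, with base_log, levels and the input value chosen for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  // Balanced base-2^4 decomposition: digits in [8, 16) are mapped to
  // [-8, 0) and a carry is pushed into the next digit, exactly as in
  // decompose_one above.
  const uint32_t base_log = 4, mask_mod_b = (1u << base_log) - 1;
  uint32_t state = 0xB7; // unsigned digits, low to high: 7, 11
  for (int level = 0; level < 2; level++) {
    uint32_t res = state & mask_mod_b;
    state >>= base_log;
    uint32_t carry = ((res - 1u) | state) & res;
    carry >>= base_log - 1;
    state += carry;
    res -= carry << base_log;
    printf("digit %d: %d\n", level, (int32_t)res);
  }
  // Prints 7 then -5; the carry 1 is absorbed by the next digit, i.e.
  // 0xB7 = 7 + 11*16 = 7 - 5*16 + 1*256.
}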
@@ -0,0 +1,74 @@
#ifndef CNCRT_GGSW_CUH
#define CNCRT_GGSW_CUH

#include "device.h"
#include "fft/bnsmfft.cuh"
#include "polynomial/parameters.cuh"

template <typename T, typename ST, class params, sharedMemDegree SMD>
__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
                                             int8_t *device_mem) {

  extern __shared__ int8_t sharedmem[];
  double2 *selected_memory;

  if constexpr (SMD == FULLSM)
    selected_memory = (double2 *)sharedmem;
  else
    // each block consumes params::degree doubles (degree / 2 double2 values)
    // of the device scratch buffer; take the address, not the value
    selected_memory = (double2 *)&device_mem[blockIdx.x * params::degree *
                                             sizeof(double)];

  // Compression
  int offset = blockIdx.x * blockDim.x;

  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    ST x = src[(tid) + params::opt * offset];
    ST y = src[(tid + params::degree / 2) + params::opt * offset];
    selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
    selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(selected_memory);
  synchronize_threads_in_block();

  // Write the output to global memory
  tid = threadIdx.x;
#pragma unroll
  for (int j = 0; j < params::opt / 2; j++) {
    dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
    tid += params::degree / params::opt;
  }
}

/**
 * Applies the FFT transform on a sequence of GGSW ciphertexts already in
 * global memory
 */
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                           uint32_t polynomial_size, uint32_t level_count,
                           uint32_t gpu_index, uint32_t max_shared_memory) {
  cudaSetDevice(stream->gpu_index);

  int shared_memory_size = sizeof(double) * polynomial_size;

  int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  int blockSize = polynomial_size / params::opt;

  if (max_shared_memory < shared_memory_size) {
    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
        <<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
  } else {
    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
        <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
                                                                      d_mem);
  }
  check_cuda_error(cudaGetLastError());
}

#endif // CNCRT_GGSW_CUH
@@ -0,0 +1,48 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include <cstdint>

/* Perform keyswitch on a batch of 32-bit input LWE ciphertexts.
 * Refer to the equivalent 64-bit operation below for more details.
 */
void cuda_keyswitch_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      stream, static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}

/* Perform keyswitch on a batch of 64-bit input LWE ciphertexts.
 *
 * - `stream` is the Cuda stream to be used in the kernel launch; it also
 *   carries the index of the GPU to use
 * - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
 *   (a0,..,an-1,b) where n is the output LWE dimension (lwe_dimension_out)
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
 *   lwe_dimension_in mask values + 1 body value
 * - ksk: the keyswitch key to be used in the operation
 * - base_log: the log of the base used in the decomposition (should be the
 *   one used to create the ksk)
 *
 * This function calls a wrapper to a device kernel that performs the
 * keyswitch; num_samples blocks of threads are launched.
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
@@ -0,0 +1,144 @@
#ifndef CNCRT_KS_CUH
#define CNCRT_KS_CUH

#include "device.h"
#include "gadget.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include <thread>
#include <vector>

template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
                                uint32_t lwe_dimension_out,
                                uint32_t level_count) {
  int pos = i * level_count * (lwe_dimension_out + 1) +
            level * (lwe_dimension_out + 1);
  Torus *ptr = &ksk[pos];
  return ptr;
}

/*
 * Keyswitch kernel.
 * Each thread handles a piece of the following equation:
 * $$GLWE_{s2}(\Delta.m + e) = (0,0,..,0,b) - \sum_{i=0}^{k-1} \langle
 * Dec(a_i), (GLWE_{s2}(s1_i q/\beta), .., GLWE_{s2}(s1_i q/\beta^l)) \rangle$$
 * where k is the dimension of the GLWE ciphertext. If the polynomial dimension
 * in GLWE is > 1, this equation is solved for each polynomial coefficient.
 * Dec denotes the decomposition with base \beta and l levels, and the inner
 * product is taken between the decomposition of a_i and the l GLWE encryptions
 * of s1_i q/\beta^j, with j in [1,l]. We obtain a GLWE encryption of \Delta.m
 * (with \Delta the scaling factor) under key s2 instead of s1, with an
 * increased noise.
 */
template <typename Torus>
__global__ void
keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
          Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
          int lwe_lower, int lwe_upper, int cutoff) {
  int tid = threadIdx.x;

  extern __shared__ int8_t sharedmem[];

  Torus *local_lwe_array_out = (Torus *)sharedmem;

  auto block_lwe_array_in = get_chunk(
      lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
  auto block_lwe_array_out = get_chunk(
      lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);

  auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);

  int lwe_part_per_thd;
  if (tid < cutoff) {
    lwe_part_per_thd = lwe_upper;
  } else {
    lwe_part_per_thd = lwe_lower;
  }
  __syncthreads();

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    local_lwe_array_out[idx] = 0;
  }
  __syncthreads();

  if (tid == 0) {
    local_lwe_array_out[lwe_dimension_out] =
        block_lwe_array_in[lwe_dimension_in];
  }

  for (int i = 0; i < lwe_dimension_in; i++) {

    __syncthreads();

    Torus a_i =
        round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);

    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
    Torus mask_mod_b = (1ll << base_log) - 1ll;

    for (int j = 0; j < level_count; j++) {
      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
      Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
      for (int k = 0; k < lwe_part_per_thd; k++) {
        int idx = tid + k * blockDim.x;
        local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
      }
    }
  }

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    block_lwe_array_out[idx] = local_lwe_array_out[idx];
  }
}

/// Assumes lwe_array_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {

  cudaSetDevice(stream->gpu_index);
  constexpr int ideal_threads = 128;

  int lwe_dim = lwe_dimension_out + 1;
  int lwe_lower, lwe_upper, cutoff;
  if (lwe_dim % ideal_threads == 0) {
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = lwe_dim / ideal_threads;
    cutoff = 0;
  } else {
    int y =
        ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
    cutoff = ideal_threads - y;
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
  }

  int lwe_size_after = (lwe_dimension_out + 1) * num_samples;

  int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);

  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
  check_cuda_error(cudaGetLastError());

  dim3 grid(num_samples, 1, 1);
  dim3 threads(ideal_threads, 1, 1);

  // cudaFuncSetAttribute(keyswitch<Torus>,
  //                      cudaFuncAttributeMaxDynamicSharedMemorySize,
  //                      shared_mem);

  keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
      lwe_upper, cutoff);
  check_cuda_error(cudaGetLastError());
}

#endif // CNCRT_KS_CUH
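To see how the host wrapper splits the lwe_dim output coefficients over ideal_threads = 128 threads, a small standalone check (not part of the diff; lwe_dim = 600 is a hypothetical value chosen for illustration):

#include <cassert>
#include <cmath>

int main() {
  const int ideal_threads = 128, lwe_dim = 600;
  // Same arithmetic as cuda_keyswitch_lwe_ciphertext_vector above:
  int lwe_lower = lwe_dim / ideal_threads; // 4
  int y = (int)ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads -
          lwe_dim;                         // 40
  int cutoff = ideal_threads - y;          // 88
  int lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads); // 5
  // 88 threads handle 5 coefficients each and 40 threads handle 4 each,
  // covering all 600 coefficients exactly once.
  assert(cutoff * lwe_upper + y * lwe_lower == lwe_dim);
}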
@@ -0,0 +1,74 @@
#ifndef CNCRT_TORUS_CUH
#define CNCRT_TORUS_CUH

#include "types/int128.cuh"
#include <limits>

template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
  r = T(x);
}

template <>
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
                                                          uint32_t &r) {
  r = __double2uint_rn(x);
}

template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
                                                          uint64_t &r) {
  // The ull intrinsic does not behave in the same way on all architectures,
  // and on some platforms this causes the cmux tree test to fail, so the
  // intrinsic is not used here
  uint128 bits = make_uint128_from_float(x);
  r = bits.lo_;
}

template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t level_count) {
  T shift = sizeof(T) * 8 - level_count * base_log;
  T mask = 1ll << (shift - 1);
  T b = (x & mask) >> (shift - 1);
  T res = x >> shift;
  res += b;
  res <<= shift;
  return res;
}

template <typename T>
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
                                                      uint32_t log_shift) {
  output =
      round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
            (double)log_shift);
}

template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
                                                   uint32_t log_shift) {
  return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
               (double)log_shift);
}

template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
                                uint32_t log_shift) {
  output =
      round(__uint2double_rn(element) /
            (__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
            __uint2double_rn(log_shift));
}

template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
                                uint32_t log_shift) {
  output = round(__ull2double_rn(element) /
                 (__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
                 __uint2double_rn(log_shift));
}
#endif // CNCRT_TORUS_CUH
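A quick worked instance of round_to_closest_multiple (not part of the diff): for T = uint32_t, base_log = 8 and level_count = 2, shift = 32 - 16 = 16, so the input is rounded to the nearest multiple of 2^16. A host-side replica of the arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t base_log = 8, level_count = 2;
  auto round_host = [&](uint32_t x) {
    uint32_t shift = 32 - level_count * base_log; // 16
    uint32_t b = (x & (1u << (shift - 1))) >> (shift - 1);
    return ((x >> shift) + b) << shift;
  };
  assert(round_host(0x12348000u) == 0x12350000u); // at halfway, rounds up
  assert(round_host(0x12347FFFu) == 0x12340000u); // below halfway, rounds down
}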
350
backends/tfhe-cuda-backend/implementation/src/device.cu
Normal file
@@ -0,0 +1,350 @@
#include "device.h"
#include <cstdint>
#include <cuda_runtime.h>

/// Unsafe function to create a CUDA stream: one must first check that the GPU
/// exists
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  cuda_stream_t *stream = new cuda_stream_t(gpu_index);
  return stream;
}

/// Unsafe function to destroy a CUDA stream: one must first check that the
/// GPU exists
int cuda_destroy_stream(cuda_stream_t *stream) {
  stream->release();
  return 0;
}

/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  void *ptr;
  cudaMalloc((void **)&ptr, size);
  check_cuda_error(cudaGetLastError());

  return ptr;
}

/// Allocates a size-byte array in device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
  cudaSetDevice(stream->gpu_index);
  void *ptr;

#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
                                          cudaDevAttrMemoryPoolsSupported,
                                          stream->gpu_index));

  if (support_async_alloc) {
    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
  } else {
    check_cuda_error(cudaMalloc((void **)&ptr, size));
  }
#else
  check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
  return ptr;
}

/// Checks that the requested allocation is valid
/// 0: valid
/// -1: invalid, not enough memory on the device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {

  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  size_t total_mem, free_mem;
  cudaMemGetInfo(&free_mem, &total_mem);
  if (size > free_mem) {
    // error code: not enough memory
    return -1;
  }
  return 0;
}

/// Returns
/// -> 0 if Cooperative Groups is not supported.
/// -> 1 otherwise
int cuda_check_support_cooperative_groups() {
  int cooperative_groups_supported = 0;
  cudaDeviceGetAttribute(&cooperative_groups_supported,
                         cudaDevAttrCooperativeLaunch, 0);

  return cooperative_groups_supported > 0;
}

/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                             cuda_stream_t *stream) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }

  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, dest);
  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }

  cudaSetDevice(stream->gpu_index);
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
  return 0;
}

/// Tries to copy memory to the GPU synchronously
/// 0: success
/// -1: error, invalid device pointer
/// -3: error, zero copy size
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }

  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, dest);
  if (attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }

  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
  return 0;
}

/// Tries to copy memory to the CPU synchronously
/// 0: success
/// -1: error, invalid device pointer
/// -3: error, zero copy size
int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }

  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, src);
  if (attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }

  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
  return 0;
}

/// Tries to copy memory within a GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer or mismatched devices
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                 cuda_stream_t *stream) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }

  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr_dest;
  cudaPointerGetAttributes(&attr_dest, dest);
  if (attr_dest.device != stream->gpu_index &&
      attr_dest.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  cudaPointerAttributes attr_src;
  cudaPointerGetAttributes(&attr_src, src);
  if (attr_src.device != stream->gpu_index &&
      attr_src.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  if (attr_src.device != attr_dest.device) {
    // error code: different devices
    return -1;
  }

  cudaSetDevice(stream->gpu_index);
  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
                                   stream->stream));
  return 0;
}

/// Synchronizes the device
/// 0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  cudaDeviceSynchronize();
  return 0;
}

/// Asynchronously sets `size` bytes at `dest` to `val` on the GPU
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero memset size
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                      cuda_stream_t *stream) {
  if (size == 0) {
    // error code: zero memset size
    return -3;
  }

  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, dest);
  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  cudaSetDevice(stream->gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
  return 0;
}

template <typename Torus>
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < n)
    array[index] = value;
}

template <typename Torus>
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
                          Torus n) {
  int block_size = 256;
  int num_blocks = (n + block_size - 1) / block_size;

  // Launch the kernel
  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
                                                                n);
}

/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
                                   uint64_t value, uint64_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
                                   uint32_t value, uint32_t n);

/// Tries to copy memory to the CPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                             cuda_stream_t *stream) {
  if (size == 0) {
    // error code: zero copy size
    return -3;
  }

  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, src);
  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }

  cudaSetDevice(stream->gpu_index);
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
  return 0;
}

/// Returns the number of GPUs available
int cuda_get_number_of_gpus() {
  int num_gpus;
  cudaGetDeviceCount(&num_gpus);
  return num_gpus;
}

/// Drop a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  check_cuda_error(cudaFree(ptr));
  return 0;
}

/// Drop a cuda array. Tries to do it asynchronously
int cuda_drop_async(void *ptr, cuda_stream_t *stream) {

  cudaSetDevice(stream->gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
                                          cudaDevAttrMemoryPoolsSupported,
                                          stream->gpu_index));

  if (support_async_alloc) {
    check_cuda_error(cudaFreeAsync(ptr, stream->stream));
  } else {
    check_cuda_error(cudaFree(ptr));
  }
#else
  check_cuda_error(cudaFree(ptr));
#endif
  return 0;
}

/// Get the maximum size of shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, gpu_index);
  int max_shared_memory = 0;
  if (prop.major >= 6) {
    max_shared_memory = prop.sharedMemPerMultiprocessor;
  } else {
    max_shared_memory = prop.sharedMemPerBlock;
  }
  return max_shared_memory;
}

int cuda_synchronize_stream(cuda_stream_t *stream) {
  stream->synchronize();
  return 0;
}
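Since cuda_malloc is documented above as unsafe, a minimal sketch of the safe wrapper it asks for (not part of the diff; the wrapper's name and nullptr-on-failure policy are assumptions):

// Hypothetical safe wrapper: validate the request with
// cuda_check_valid_malloc() before calling the unsafe cuda_malloc(),
// returning nullptr instead of proceeding on an invalid GPU or an
// oversized allocation.
void *safe_cuda_malloc(uint64_t size, uint32_t gpu_index) {
  if (cuda_check_valid_malloc(size, gpu_index) != 0)
    return nullptr;
  return cuda_malloc(size, gpu_index);
}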
725
backends/tfhe-cuda-backend/implementation/src/fft/bnsmfft.cuh
Normal file
@@ -0,0 +1,725 @@
|
||||
#ifndef GPU_BOOTSTRAP_FFT_CUH
|
||||
#define GPU_BOOTSTRAP_FFT_CUH
|
||||
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "twiddles.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
/*
|
||||
* Direct negacyclic FFT:
|
||||
* - before the FFT the N real coefficients are stored into a
|
||||
* N/2 sized complex with the even coefficients in the real part
|
||||
* and the odd coefficients in the imaginary part. This is referred to
|
||||
* as the half-size FFT
|
||||
* - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
|
||||
* opt is divided by 2 because the butterfly pattern is always applied
|
||||
* between pairs of coefficients
|
||||
* - instead of twisting each coefficient A_j before the FFT by
|
||||
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
|
||||
* the FFT is modified, and for each level k of the FFT the twiddle:
|
||||
* w_j,k = exp(-i pi j/2^k)
|
||||
* is replaced with:
|
||||
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
|
||||
/* We don't make bit reverse here, since twiddles are already reversed
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, v, w;
|
||||
// level 1
|
||||
// we don't make actual complex multiplication on level1 since we have only
|
||||
// one twiddle, it's real and image parts are equal, so we can multiply
|
||||
// it with simpler operations
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
i1 = tid;
|
||||
i2 = tid + params::degree / 2;
|
||||
|
||||
u = A[i1];
|
||||
v = A[i2] * (double2){0.707106781186547461715008466854,
|
||||
0.707106781186547461715008466854};
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
// from this level there are more than one twiddles and none of them has equal
|
||||
// real and imag parts, so complete complex multiplication is needed
|
||||
// for each level params::degree / 2^level represents number of coefficients
|
||||
// inside divided chunk of specific level
|
||||
//
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// from level 8, we need to check size of params degree, because we support
|
||||
// minimum actual polynomial size = 256, when compressed size is halfed and
|
||||
// minimum supported compressed size is 128, so we always need first 7
|
||||
// levels of butterfy operation, since butterfly levels are hardcoded
|
||||
// we need to check if polynomial size is big enough to require specific level
|
||||
// of butterfly.
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// compressed size = 8192 is actual polynomial size = 16384.
|
||||
// from this size, twiddles can't fit in constant memory,
|
||||
// so from here, butterfly operation access device memory.
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
|
||||
w = negtwiddles13[twid_id];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* negacyclic inverse fft
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
|
||||
/* We don't make bit reverse here, since twiddles are already reversed
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, w;
|
||||
|
||||
// divide input by compressed polynomial size
|
||||
tid = threadIdx.x;
|
||||
for (size_t i = 0; i < params::opt; ++i) {
|
||||
A[tid] /= params::degree;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// none of the twiddles have equal real and imag part, so
|
||||
// complete complex multiplication has to be done
|
||||
// here we have more than one twiddle
|
||||
// mapping in backward fft is reversed
|
||||
// butterfly operation is started from last level
|
||||
|
||||
// compressed size = 8192 is actual polynomial size = 16384.
|
||||
// twiddles for this size can't fit in constant memory so
|
||||
// butterfly operation for this level acess device memory to fetch
|
||||
// twiddles
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
|
||||
w = negtwiddles13[twid_id];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// below level 8, we don't need to check size of params degree, because we
|
||||
// support minimum actual polynomial size = 256, when compressed size is
|
||||
// halfed and minimum supported compressed size is 128, so we always need
|
||||
// last 7 levels of butterfy operation, since butterfly levels are hardcoded
|
||||
// we don't need to check if polynomial size is big enough to require
|
||||
// specific level of butterfly.
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 1
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2);
|
||||
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
|
||||
i2 = i1 + params::degree / 2;
|
||||
|
||||
w = negtwiddles[twid_id + 1];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
/*
|
||||
* global batch fft
|
||||
* does fft in half size
|
||||
* unrolling half size fft result in half size + 1 elements
|
||||
* this function must be called with actual degree
|
||||
* function takes as input already compressed input
|
||||
*/
|
||||
template <class params, sharedMemDegree SMD>
|
||||
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
|
||||
double2 *buffer) {
|
||||
extern __shared__ double2 sharedMemoryFFT[];
|
||||
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
|
||||
: sharedMemoryFFT;
|
||||
int tid = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
__syncthreads();
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||

/*
 * global batch polynomial multiplication
 * only used for fft tests
 * d_input1 and d_output must not have the same pointer
 * d_input1 can be modified inside the function
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
                                     double2 *d_output, double2 *buffer) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
                               : sharedMemoryFFT;

  // Move the first polynomial into shared memory (if possible, otherwise it
  // is placed in the device buffer)
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid];
    tid = tid + params::degree / params::opt;
  }

  // Perform the direct negacyclic Fourier transform
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // Put the result of the direct fft inside input1
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
    tid = tid + params::degree / params::opt;
  }
  __syncthreads();

  // Move the second polynomial into shared memory (if possible, otherwise it
  // is placed in the device buffer)
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid];
    tid = tid + params::degree / params::opt;
  }

  // Perform the direct negacyclic Fourier transform on the second polynomial
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // Calculate the pointwise multiplication inside the fft buffer
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid];
    tid = tid + params::degree / params::opt;
  }

  // Perform the backward negacyclic Fourier transform
  __syncthreads();
  NSMFFT_inverse<HalfDegree<params>>(fft);
  __syncthreads();

  // Copy the results to the output buffer
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
    tid = tid + params::degree / params::opt;
  }
}

#endif // GPU_BOOTSTRAP_FFT_CUH
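
For reference, batch_polynomial_mul computes a negacyclic product: coefficients are multiplied modulo X^N + 1, so wrapped-around terms pick up a sign flip. A minimal host-side sketch of the same product in the time domain (quadratic, for testing intuition only; the function name here is illustrative and not part of the backend API):

#include <vector>

// Naive negacyclic product modulo X^N + 1: equivalent (up to rounding) to the
// FFT path above -- transform both inputs, multiply pointwise, transform back.
// Wrap-around terms are subtracted because X^N == -1.
std::vector<double> negacyclic_mul(const std::vector<double> &a,
                                   const std::vector<double> &b) {
  size_t n = a.size();
  std::vector<double> out(n, 0.0);
  for (size_t i = 0; i < n; i++)
    for (size_t j = 0; j < n; j++) {
      size_t k = i + j;
      if (k < n)
        out[k] += a[i] * b[j];
      else
        out[k - n] -= a[i] * b[j]; // wrapped term: X^N == -1
    }
  return out;
}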
8197 backends/tfhe-cuda-backend/implementation/src/fft/twiddles.cu Normal file
File diff suppressed because it is too large
@@ -0,0 +1,13 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH

/*
 * 'negtwiddles' are stored in constant memory for faster access times
 * because of its limited size, only twiddles for up to 2^12 polynomial size
 * can be stored there; twiddles for 2^13 are stored in device memory as
 * 'negtwiddles13'
 */

extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif
@@ -0,0 +1,51 @@
#include "integer/bitwise_ops.cuh"

void scratch_cuda_integer_radix_bitop_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
      stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
      params, op_type, allocate_gpu_memory);
}

void cuda_bitop_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_1),
      static_cast<uint64_t *>(lwe_array_2),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      lwe_ciphertext_count);
}

void cuda_bitnot_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitnot_kb<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      lwe_ciphertext_count);
}

void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(stream);
}
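
These entry points follow the scratch / run / cleanup lifecycle used throughout the backend. A sketch of the expected call order, using only the signatures added in this diff (the wrapper function itself is hypothetical; every parameter is supplied by the caller's cryptographic parameter set and key setup):

// Hypothetical helper, not part of the backend: shows the lifecycle of the
// bitop entry points above under caller-provided parameters.
void run_bitop_once(cuda_stream_t *stream, void *d_out, void *d_lhs,
                    void *d_rhs, void *d_bsk, void *d_ksk,
                    uint32_t glwe_dimension, uint32_t polynomial_size,
                    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
                    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
                    uint32_t pbs_base_log, uint32_t grouping_factor,
                    uint32_t num_blocks, uint32_t message_modulus,
                    uint32_t carry_modulus, PBS_TYPE pbs_type,
                    BITOP_TYPE op_type) {
  int8_t *mem_ptr = nullptr;

  // 1) Allocate scratch space for this op shape.
  scratch_cuda_integer_radix_bitop_kb_64(
      stream, &mem_ptr, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, num_blocks, message_modulus, carry_modulus, pbs_type,
      op_type, /*allocate_gpu_memory=*/true);

  // 2) Run the ciphertext-level op (can be repeated with the same buffer).
  cuda_bitop_integer_radix_ciphertext_kb_64(stream, d_out, d_lhs, d_rhs,
                                            mem_ptr, d_bsk, d_ksk, num_blocks);

  // 3) Release the scratch buffer.
  cleanup_cuda_integer_bitop(stream, &mem_ptr);
}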
@@ -0,0 +1,51 @@
#ifndef CUDA_INTEGER_BITWISE_OPS_CUH
#define CUDA_INTEGER_BITWISE_OPS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <omp.h>

template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                            Torus *ksk, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
      num_radix_blocks, lut);
}

template <typename Torus>
__host__ void
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             Torus *lwe_array_in,
                             int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
}

template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
    cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
                                         allocate_gpu_memory);
}

#endif
@@ -0,0 +1,45 @@
#include "integer/cmux.cuh"

void scratch_cuda_integer_radix_cmux_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };

  scratch_cuda_integer_radix_cmux_kb(
      stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
    void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_condition),
      static_cast<uint64_t *>(lwe_array_true),
      static_cast<uint64_t *>(lwe_array_false),
      (int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      lwe_ciphertext_count);
}

void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
                                     int8_t **mem_ptr_void) {

  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(stream);
}
100 backends/tfhe-cuda-backend/implementation/src/integer/cmux.cuh Normal file
@@ -0,0 +1,100 @@
#ifndef CUDA_INTEGER_CMUX_CUH
#define CUDA_INTEGER_CMUX_CUH

#include "integer.cuh"
#include <omp.h>

template <typename Torus>
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *bsk,
                          Torus *ksk, uint32_t num_radix_blocks) {
  auto params = mem_ptr->params;

  int big_lwe_size = params.big_lwe_dimension + 1;

  // Left message is shifted
  int num_blocks = 0, num_threads = 0;
  int num_entries = (params.big_lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);

  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
  // second operand is fixed
  auto tmp_lwe_array_input = mem_ptr->tmp;
  for (int i = 0; i < num_radix_blocks; i++) {
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
                                   stream->stream>>>(
        lwe_array_out_block, lwe_array_input_block, lwe_condition,
        predicate->lwe_indexes, params.big_lwe_dimension,
        params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
      predicate);
}

template <typename Torus>
__host__ void
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                           Torus *lwe_condition, Torus *lwe_array_true,
                           Torus *lwe_array_false,
                           int_cmux_buffer<Torus> *mem_ptr, void *bsk,
                           Torus *ksk, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;

  // Since our CPU threads will be working on different streams, we must
  // ensure the work in the main stream is completed
  stream->synchronize();
  auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
  auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;

#pragma omp parallel sections
  {
    // Both sections may be executed in parallel
#pragma omp section
    {
      auto mem_true = mem_ptr->zero_if_true_buffer;
      zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
                  lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
                  ksk, num_radix_blocks);
    }
#pragma omp section
    {
      auto mem_false = mem_ptr->zero_if_false_buffer;
      zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
                  lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
                  num_radix_blocks);
    }
  }
  cuda_synchronize_stream(true_stream);
  cuda_synchronize_stream(false_stream);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0. If the condition was false, true_ct will be 0 and false_ct
  // will have kept its value.
  auto added_cts = mem_ptr->tmp_true_ct;
  host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
                params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
      mem_ptr->message_extract_lut);
}

template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
    cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

  *mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
                                        num_radix_blocks, allocate_gpu_memory);
}
#endif
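
For intuition, the CMUX above realises out = condition ? ct_true : ct_false by zeroing out the branch that is not selected and adding the two results, exactly as the comments in host_integer_radix_cmux_kb describe. A plaintext sketch of the same computation (the function name is illustrative, not backend API):

#include <cstdint>

// cmux(c, t, f): exactly one of the two "zeroed" terms survives, so their
// sum is the selected value; a final message-extraction LUT cleans the result.
uint64_t cmux_reference(bool condition, uint64_t val_true, uint64_t val_false) {
  uint64_t kept_true = condition ? val_true : 0;   // zero_out_if, inverted predicate
  uint64_t kept_false = condition ? 0 : val_false; // zero_out_if, predicate
  return kept_true + kept_false;
}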
@@ -0,0 +1,83 @@
#include "integer/comparison.cuh"

void scratch_cuda_integer_radix_comparison_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  switch (op_type) {
  case EQ:
  case NE:
    scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
    break;
  case GT:
  case GE:
  case LT:
  case LE:
  case MAX:
  case MIN:
    scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
    break;
  }
}

void cuda_comparison_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
  switch (buffer->op) {
  case EQ:
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
    break;
  case GT:
  case GE:
  case LT:
  case LE:
    host_integer_radix_difference_check_kb<uint64_t>(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
        lwe_ciphertext_count);
    break;
  case MAX:
  case MIN:
    host_integer_radix_maxmin_kb<uint64_t>(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
    break;
  default:
    printf("Not implemented\n");
  }
}

void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
                                     int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(stream);
}
@@ -0,0 +1,468 @@
#ifndef CUDA_INTEGER_COMPARISON_OPS_CUH
#define CUDA_INTEGER_COMPARISON_OPS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/cmux.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_addition.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/kernel_dimensions.cuh"

// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
template <typename Torus>
__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
                                             uint32_t lwe_dimension,
                                             uint32_t num_blocks) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < lwe_dimension + 1) {
    auto block = &input_block[idx];

    Torus sum = block[0];
    for (int i = 1; i < num_blocks; i++) {
      sum += block[i * (lwe_dimension + 1)];
    }

    output[idx] = sum;
  }
}

template <typename Torus>
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
                                    Torus *input, uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__host__ void
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
                               Torus *lwe_array_in,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  auto are_all_block_true_buffer =
      mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(
      lwe_array_out, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);

  int lut_num_blocks = 0;
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 1) {
    // Split into chunks of up to max_value blocks
    uint32_t chunk_length = std::min(max_value, remaining_blocks);
    int num_chunks = remaining_blocks / chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = lwe_array_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(stream, accumulator, input_blocks,
                            big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
      input_blocks += (big_lwe_dimension + 1) * chunk_length;
    }
    accumulator = are_all_block_true_buffer->tmp_block_accumulated;

    // Selects a LUT
    int_radix_lut<Torus> *lut;
    if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
    } else if (chunk_length == max_value) {
      // is_max_value LUT
      lut = are_all_block_true_buffer->is_max_value_lut;
    } else {
      // is_equal_to_num_blocks LUT
      lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
      if (chunk_length != lut_num_blocks) {
        auto is_equal_to_num_blocks_lut_f = [max_value,
                                             chunk_length](Torus x) -> Torus {
          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
            stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
            carry_modulus, is_equal_to_num_blocks_lut_f);

        // We don't have to generate this lut again
        lut_num_blocks = chunk_length;
      }
    }

    // Applies the LUT
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
  }
}

// This takes an input slice of blocks.
//
// Each block can encrypt any value as long as it is < message_modulus.
//
// It will compare blocks with 0, for either equality or difference.
//
// This returns a Vec of blocks, where each block encrypts 1 or 0
// depending on whether all blocks matched the comparison type with 0.
//
// E.g. for ZeroComparisonType::Equality, if all input blocks are zero
// then all returned blocks will encrypt 1
//
// The returned Vec will have fewer blocks than the number of input blocks.
// The returned blocks potentially need to be 'reduced' to one block
// with e.g. are_all_comparisons_block_true.
//
// This function exists because sometimes it is faster to concatenate
// multiple vecs of 'boolean' shortint blocks before reducing them with
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    int32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // The idea is that we will sum chunks of blocks until carries are full,
  // then we compare the sum with 0.
  //
  // If all blocks were 0, the sum will be zero
  // If at least one block was not zero, the sum won't be zero
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t message_max = message_modulus - 1;

  uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;

  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  int num_sum_blocks = 0;
  // Accumulator
  auto sum = lwe_array_out;

  if (num_radix_blocks == 1) {
    // Just copy
    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;

    auto sum_i = sum;
    auto chunk = lwe_array_in;
    while (remainder_blocks > 1) {
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

      accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
                            chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);

      // Update operands
      chunk += chunk_size * big_lwe_size;
      sum_i += big_lwe_size;
    }
  }

  auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
                                 num_sum_blocks);

  // The result will be in the two first blocks. Everything else is
  // garbage.
  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
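
A quick sanity check of the chunking arithmetic above, under the common choice message_modulus = carry_modulus = 4 (an illustrative assumption, not mandated by the code):

#include <cstdint>

// Illustrative values only: with 2-bit message and 2-bit carry,
// total_modulus = 16 and message_max = 3.
constexpr uint32_t message_modulus = 4;
constexpr uint32_t carry_modulus = 4;
constexpr uint32_t total_modulus = message_modulus * carry_modulus; // 16
constexpr uint32_t message_max = message_modulus - 1;               // 3
// Five blocks of at most 3 sum to at most 15 = total_modulus - 1, so a chunk
// of up to 5 blocks can be summed without wrapping the carry space.
static_assert((total_modulus - 1) / message_max == 5,
              "num_elements_to_fill_carry under these parameters");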
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks) {

  auto eq_buffer = mem_ptr->eq_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb(
      stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
      eq_buffer->operator_lut);

  // This takes a Vec of blocks, where each block is either 0 or 1.
  //
  // It returns a block encrypting 1 if all input blocks are 1,
  // otherwise the block encrypts 0
  are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
                                 bsk, ksk, num_radix_blocks);

  // Zero all blocks but the first
  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}

template <typename Torus>
__host__ void scratch_cuda_integer_radix_equality_check_kb(
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_comparison_buffer<Torus>(
      stream, op, params, num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                        Torus *ksk, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // When rhs > lhs, the subtraction will overflow and the bit of padding will
  // be set to 1, meaning that the output of the pbs will be the negative
  // (modulo message space)
  //
  // Example:
  // lhs: 1, rhs: 3, message modulus: 4, carry modulus: 4
  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
  // Since there was an overflow, the bit of padding is 1 and not 0.
  // When applying the LUT for an input value of 14 we would expect 1,
  // but since the bit of padding is 1, we will get -1 modulo our message
  // space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
  host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
                   big_lwe_dimension, num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(
      stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
      is_non_zero_lut);

  // Add one
  // Here lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
                                            big_lwe_dimension, num_radix_blocks,
                                            message_modulus, carry_modulus);
}

// Reduces a vec containing shortint blocks that encrypt a sign
// (inferior, equal, superior) to one single shortint block containing the
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
                    Torus *ksk, uint32_t num_radix_blocks) {

  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // Tree reduction
  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  auto x = tree_buffer->tmp_x;
  auto y = tree_buffer->tmp_y;
  if (x != lwe_block_comparisons)
    cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
                                 big_lwe_size_bytes * num_radix_blocks, stream);

  uint32_t partial_block_count = num_radix_blocks;

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);

    if ((partial_block_count % 2) != 0) {
      partial_block_count >>= 1;
      partial_block_count++;

      auto last_y_block = y + (partial_block_count - 1) * big_lwe_size;
      auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;

      cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
                                   big_lwe_size_bytes, stream);
    } else {
      partial_block_count >>= 1;
    }
  }

  auto last_lut = tree_buffer->tree_last_leaf_lut;
  auto block_selector_f = tree_buffer->block_selector_f;
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
      int lsb = x & 3;

      int final_sign = block_selector_f(msb, lsb);
      return sign_handler_f(final_sign);
    };
  } else {
    // partial_block_count == 1
    y = x;
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
                                     polynomial_size, message_modulus,
                                     carry_modulus, f);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
                                                 ksk, 1, last_lut);
}

template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
    uint32_t total_num_radix_blocks) {

  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  uint32_t num_radix_blocks = total_num_radix_blocks;
  auto lhs = lwe_array_left;
  auto rhs = lwe_array_right;
  if (carry_modulus == message_modulus) {
    // Packing is possible
    // Pack inputs
    Torus *packed_left = diff_buffer->tmp_packed_left;
    Torus *packed_right = diff_buffer->tmp_packed_right;
    pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
                num_radix_blocks, message_modulus);
    pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
                num_radix_blocks, message_modulus);
    // From this point we have half the number of blocks
    num_radix_blocks /= 2;

    // Clean noise
    auto cleaning_lut = mem_ptr->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb(
        stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
        cleaning_lut);
    integer_radix_apply_univariate_lookup_table_kb(
        stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
        cleaning_lut);

    lhs = packed_left;
    rhs = packed_right;
  }

  // comparisons will be assigned
  // - 0 if lhs < rhs
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto comparisons = mem_ptr->tmp_block_comparisons;
  compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
                          num_radix_blocks);

  // Reduces a vec containing radix blocks that encrypt a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
  tree_sign_reduction(stream, lwe_array_out, comparisons,
                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
                      ksk, num_radix_blocks);

  // The result will be in the first block. Everything else is garbage.
  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                    (total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
}
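
For intuition, the 0/1/2 sign encoding reduced above behaves like a lexicographic comparison: a more significant block decides the result unless it encodes "equal". A plaintext sketch of that reduction (illustrative only; this assumes block_selector_f implements the combiner below, which the diff itself does not show):

#include <vector>

// Combine two per-block signs; the more significant one wins unless "equal".
// Signs: 0 = inferior, 1 = equal, 2 = superior (see comments above).
int combine_signs(int msb_sign, int lsb_sign) {
  return (msb_sign == 1) ? lsb_sign : msb_sign;
}

// Reduce per-block signs (least significant block first) to the final sign.
int reduce_signs(const std::vector<int> &signs_lsb_first) {
  int result = 1; // "equal" is the neutral element
  for (int s : signs_lsb_first)
    result = combine_signs(s, result);
  return result; // 0: lhs < rhs, 1: lhs == rhs, 2: lhs > rhs
}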

template <typename Torus>
__host__ void scratch_cuda_integer_radix_difference_check_kb(
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_comparison_buffer<Torus>(
      stream, op, params, num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             Torus *lwe_array_left, Torus *lwe_array_right,
                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t total_num_radix_blocks) {

  // Compute the sign
  host_integer_radix_difference_check_kb(
      stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);

  // Selector
  host_integer_radix_cmux_kb(
      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}

#endif
127 backends/tfhe-cuda-backend/implementation/src/integer/integer.cu Normal file
@@ -0,0 +1,127 @@
#include "integer/integer.cuh"
|
||||
#include <linear_algebra.h>
|
||||
|
||||
void cuda_full_propagation_64_inplace(
|
||||
cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
|
||||
void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
scratch_cuda_full_propagation<uint64_t>(
|
||||
stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_fullprop_buffer<uint64_t> *mem_ptr =
|
||||
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
cuda_drop_async(mem_ptr->lut_buffer, stream);
|
||||
cuda_drop_async(mem_ptr->lut_indexes, stream);
|
||||
|
||||
cuda_drop_async(mem_ptr->pbs_buffer, stream);
|
||||
|
||||
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
|
||||
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
|
||||
}
|
||||
|
||||
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
|
||||
stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_propagate_single_carry_low_latency_kb_64_inplace(
|
||||
cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
|
||||
void *ksk, uint32_t num_blocks) {
|
||||
host_propagate_single_carry_low_latency<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array),
|
||||
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
|
||||
static_cast<uint64_t *>(ksk), num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_sc_prop_memory<uint64_t> *mem_ptr =
|
||||
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
@@ -0,0 +1,675 @@
#ifndef CUDA_INTEGER_CUH
#define CUDA_INTEGER_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <functional>

template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
                 Torus *lwe_output_indexes, Torus *lut_vector,
                 Torus *lut_vector_indexes, Torus *lwe_array_in,
                 Torus *lwe_input_indexes, void *bootstrapping_key,
                 int8_t *pbs_buffer, uint32_t glwe_dimension,
                 uint32_t lwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count,
                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
                 uint32_t num_lut_vectors, uint32_t lwe_idx,
                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
  if (sizeof(Torus) == sizeof(uint32_t)) {
    // 32 bits
    switch (pbs_type) {
    case MULTI_BIT:
      printf("Error: 32-bit multibit PBS is not supported.\n");
      break;
    case LOW_LAT:
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
      break;
    case AMORTIZED:
      cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
      break;
    default:
      break;
    }
  } else {
    // 64 bits
    switch (pbs_type) {
    case MULTI_BIT:
      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, grouping_factor, base_log, level_count,
          input_lwe_ciphertext_count, num_lut_vectors, lwe_idx,
          max_shared_memory);
      break;
    case LOW_LAT:
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
      break;
    case AMORTIZED:
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
      break;
    default:
      break;
    }
  }
}

// Rotates the radix ciphertext right by `value` blocks
// grid is one dimensional
// blockIdx.x represents the x_th block of the radix ciphertext
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                          uint32_t value, uint32_t blocks_count,
                                          uint32_t lwe_size) {
  value %= blocks_count;

  size_t tid = threadIdx.x;
  size_t src_block_id = blockIdx.x;
  size_t dst_block_id = (src_block_id + value) % blocks_count;
  size_t stride = blockDim.x;

  auto cur_src_block = &src[src_block_id * lwe_size];
  auto cur_dst_block = &dst[dst_block_id * lwe_size];

  for (size_t i = tid; i < lwe_size; i += stride) {
    cur_dst_block[i] = cur_src_block[i];
  }
}

// Rotates the radix ciphertext left by `value` blocks
// grid is one dimensional
// blockIdx.x represents the x_th block of the radix ciphertext
template <typename Torus>
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
                                         uint32_t blocks_count,
                                         uint32_t lwe_size) {
  value %= blocks_count;
  size_t src_block_id = blockIdx.x;

  size_t tid = threadIdx.x;
  size_t dst_block_id = (src_block_id >= value)
                            ? src_block_id - value
                            : src_block_id - value + blocks_count;
  size_t stride = blockDim.x;

  auto cur_src_block = &src[src_block_id * lwe_size];
  auto cur_dst_block = &dst[dst_block_id * lwe_size];

  for (size_t i = tid; i < lwe_size; i += stride) {
    cur_dst_block[i] = cur_src_block[i];
  }
}

// polynomial_size threads
template <typename Torus>
__global__ void
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_array_1,
                             Torus *lwe_array_2, Torus *lwe_indexes,
                             uint32_t lwe_dimension, uint32_t message_modulus,
                             uint32_t num_blocks) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if (tid < num_blocks * (lwe_dimension + 1)) {
    int block_id = tid / (lwe_dimension + 1);
    int coeff_id = tid % (lwe_dimension + 1);

    int pos = lwe_indexes[block_id] * (lwe_dimension + 1) + coeff_id;
    lwe_array_out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos];
  }
}

template <typename Torus>
__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                                    Torus *lwe_array_1, Torus *lwe_array_2,
                                    Torus *lwe_indexes, uint32_t lwe_dimension,
                                    uint32_t message_modulus,
                                    uint32_t num_radix_blocks) {

  // Left message is shifted
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      lwe_array_out, lwe_array_1, lwe_array_2, lwe_indexes, lwe_dimension,
      message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
}
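
The packing above is what turns a univariate programmable bootstrap into a bivariate one: two clean blocks are combined as lhs * message_modulus + rhs, so a single LUT indexed by the packed value can evaluate any f(lhs, rhs). A plaintext sketch of the encode/decode round trip, mirroring the decode used in generate_lookup_table_bivariate further down (illustrative values only):

#include <cstdint>

// Pack two messages, each < message_modulus, into one value, then recover
// them the way the bivariate LUT generation does.
int main() {
  const uint64_t message_modulus = 4;
  const uint64_t lhs = 2, rhs = 3;

  uint64_t packed = lhs * message_modulus + rhs; // 11

  uint64_t decoded_lhs = (packed / message_modulus) % message_modulus; // 2
  uint64_t decoded_rhs = packed % message_modulus;                     // 3

  return !(decoded_lhs == lhs && decoded_rhs == rhs); // 0 on success
}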

template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  // apply_lookup_table
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto small_lwe_dimension = params.small_lwe_dimension;
  auto ks_level = params.ks_level;
  auto ks_base_log = params.ks_base_log;
  auto pbs_level = params.pbs_level;
  auto pbs_base_log = params.pbs_base_log;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

  // Compute Keyswitch-PBS
  cuda_keyswitch_lwe_ciphertext_vector(
      stream, lut->tmp_lwe_after_ks, lut->lwe_indexes, lwe_array_in,
      lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
      ks_base_log, ks_level, num_radix_blocks);

  execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
              lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
              lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
              polynomial_size, pbs_base_log, pbs_level, grouping_factor,
              num_radix_blocks, 1, 0,
              cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}

template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
    int_radix_lut<Torus> *lut) {
  // apply_lookup_table_bivariate

  auto params = lut->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;

  // Left message is shifted
  pack_bivariate_blocks(stream, lut->tmp_lwe_before_ks, lwe_array_1,
                        lwe_array_2, lut->lwe_indexes, big_lwe_dimension,
                        message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  // Apply LUT
  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
                                                 lut->tmp_lwe_before_ks, bsk,
                                                 ksk, num_radix_blocks, lut);
}

// Rotates the slice in-place such that the first `mid` elements of the slice
// move to the end while the remaining elements move to the front. After
// calling rotate_left, the element previously at index `mid` will become the
// first element in the slice.
template <typename Torus>
void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
  mid = mid % array_length;

  std::rotate(buffer, buffer + mid, buffer + array_length);
}

template <typename Torus>
void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
                           uint32_t polynomial_size, uint32_t message_modulus,
                           uint32_t carry_modulus,
                           std::function<Torus(Torus)> f) {

  uint32_t modulus_sup = message_modulus * carry_modulus;
  uint32_t box_size = polynomial_size / modulus_sup;
  Torus delta = (1ul << 63) / modulus_sup;

  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];

  // Fill the box owned by each input value with f evaluated at that value,
  // scaled by delta
  for (int i = 0; i < modulus_sup; i++) {
    int index = i * box_size;
    for (int j = index; j < index + box_size; j++) {
      auto f_eval = f(i);
      body[j] = f_eval * delta;
    }
  }

  int half_box_size = box_size / 2;

  // Negate the first half_box_size coefficients
  for (int i = 0; i < half_box_size; i++) {
    body[i] = -body[i];
  }

  rotate_left(body, half_box_size, polynomial_size);
}

template <typename Torus>
void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                                     uint32_t polynomial_size,
                                     uint32_t message_modulus,
                                     uint32_t carry_modulus,
                                     std::function<Torus(Torus, Torus)> f) {

  Torus factor_u64 = message_modulus;
  auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
    Torus lhs = (input / factor_u64) % message_modulus;
    Torus rhs = (input % factor_u64) % message_modulus;

    return f(lhs, rhs);
  };

  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, wrapped_f);
}
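
A worked instance of the encoding above, assuming message_modulus = carry_modulus = 4 and polynomial_size = 2048 (illustrative numbers, not mandated by the code): modulus_sup = 16, box_size = 2048 / 16 = 128 and delta = 2^63 / 16 = 2^59. Each input value i owns a 128-coefficient box holding f(i) * delta; negating the first half-box and rotating by half a box shifts box boundaries so that, informally, PBS rounding around each input value lands inside the box encoding f(i).

#include <cstdint>

// Sanity-check the LUT layout arithmetic under the illustrative parameters.
int main() {
  const uint64_t message_modulus = 4, carry_modulus = 4;
  const uint64_t polynomial_size = 2048;

  const uint64_t modulus_sup = message_modulus * carry_modulus; // 16
  const uint64_t box_size = polynomial_size / modulus_sup;      // 128
  const uint64_t delta = (1ul << 63) / modulus_sup;             // 2^59

  // Before the half-box rotation, the box owned by input i spans
  // coefficients [i * box_size, (i + 1) * box_size).
  const uint64_t i = 5;
  const uint64_t box_start = i * box_size; // 640
  return (box_start == 640 && delta == (1ul << 59)) ? 0 : 1;
}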

/*
 * generate bivariate accumulator for device pointer
 * v_stream - cuda stream
 * acc - device pointer for bivariate accumulator
 * ...
 * f - wrapping function with two Torus inputs
 */
template <typename Torus>
void generate_device_accumulator_bivariate(
    cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus, Torus)> f) {

  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));

  // fill bivariate accumulator
  generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
                                         message_modulus, carry_modulus, f);

  // copy host lut to device
  cuda_memcpy_async_to_gpu(
      acc_bivariate, h_lut,
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);

  cuda_synchronize_stream(stream);
  free(h_lut);
}

/*
 * generate univariate accumulator for device pointer
 * v_stream - cuda stream
 * acc - device pointer for accumulator
 * ...
 * f - evaluating function with one Torus input
 */
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size,
                                 uint32_t message_modulus,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));

  // fill accumulator
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

  // copy host lut to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream);

  cuda_synchronize_stream(stream);
  free(h_lut);
}

template <typename Torus>
void scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
    cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
                                           allocate_gpu_memory);
}

template <typename Torus>
void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
                                             Torus *lwe_array,
                                             int_sc_prop_memory<Torus> *mem,
                                             void *bsk, Torus *ksk,
                                             uint32_t num_blocks) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  auto generates_or_propagates = mem->generates_or_propagates;
  auto step_output = mem->step_output;

  auto test_vector_array = mem->test_vector_array;
  auto lut_carry_propagation_sum = mem->lut_carry_propagation_sum;
  auto message_acc = mem->message_acc;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
      test_vector_array);

  // Compute the prefix sum with the Hillis & Steele scan
  int num_steps = ceil(log2((double)num_blocks));
  int space = 1;
  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
                               big_lwe_size_bytes * num_blocks, stream);

  for (int step = 0; step < num_steps; step++) {
    auto cur_blocks = &step_output[space * big_lwe_size];
    auto prev_blocks = generates_or_propagates;
    int cur_total_blocks = num_blocks - space;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
        lut_carry_propagation_sum);

    cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
                                 cur_blocks,
                                 big_lwe_size_bytes * cur_total_blocks, stream);
    space *= 2;
  }

  radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
      step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);

  host_addition(stream, lwe_array, lwe_array, step_output,
                glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
}
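
The loop above mirrors the classic Hillis & Steele inclusive scan: at step k, each position combines itself with the element 2^k positions back, so the full prefix sum finishes in ceil(log2(n)) steps. A plaintext sketch of that scan (the homomorphic version combines blocks with the bivariate carry-sum LUT instead of the + operator shown here):

#include <utility>
#include <vector>

std::vector<int> hillis_steele_scan(std::vector<int> x) {
  for (int space = 1; space < (int)x.size(); space *= 2) {
    std::vector<int> next = x;
    for (int i = space; i < (int)x.size(); i++)
      next[i] = x[i] + x[i - space]; // combine with element `space` back
    x = std::move(next);
  }
  return x; // x[i] now holds the sum of the first i + 1 inputs
}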

/*
 * input_blocks: input radix ciphertext, propagation happens in place
 * acc_message_carry: list of two LUTs, [(message_acc), (carry_acc)]
 * tvi_message_carry: tvi for message and carry, should always be {0, 1}
 * small_lwe_vector: output of keyswitch, should have
 * size = 2 * (lwe_dimension + 1) * sizeof(Torus)
 * big_lwe_vector: output of pbs, should have
 * size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
template <typename Torus, typename STorus, class params>
void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
                                 int_fullprop_buffer<Torus> *mem_ptr,
                                 Torus *ksk, void *bsk, uint32_t lwe_dimension,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size, uint32_t ks_base_log,
                                 uint32_t ks_level, uint32_t pbs_base_log,
                                 uint32_t pbs_level, uint32_t grouping_factor,
                                 uint32_t num_blocks) {

  int big_lwe_size = (glwe_dimension * polynomial_size + 1);
  int small_lwe_size = (lwe_dimension + 1);

  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
        stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
        cur_input_block, mem_ptr->lwe_indexes, ksk,
        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
        1);

    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                 mem_ptr->tmp_small_lwe_vector,
                                 small_lwe_size * sizeof(Torus), stream);

    execute_pbs<Torus>(
        stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
        mem_ptr->lut_buffer, mem_ptr->lut_indexes,
        mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
        mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
        cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);

    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), stream);

    if (i < num_blocks - 1) {
      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
      host_addition(stream, next_input_block, next_input_block,
                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
                    glwe_dimension * polynomial_size, 1);
    }
  }
}
|
||||
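// Clear-value sketch of one full propagation pass (hypothetical helper, for
// exposition only): each block is split by the message/carry LUT pair used
// below, the message stays in place and the carry is added to the next
// block, mirroring the keyswitch + PBS + host_addition sequence above.
inline void full_propagate_clear_reference(uint64_t *blocks, int num_blocks,
                                           uint64_t message_modulus) {
  for (int i = 0; i < num_blocks; i++) {
    uint64_t message = blocks[i] % message_modulus; // message LUT
    uint64_t carry = blocks[i] / message_modulus;   // carry LUT
    blocks[i] = message;
    if (i < num_blocks - 1)
      blocks[i + 1] += carry; // host_addition on the next block
  }
}
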
template <typename Torus>
void scratch_cuda_full_propagation(
    cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

  // PBS
  int8_t *pbs_buffer;
  if (pbs_type == MULTI_BIT) {
    uint32_t lwe_chunk_size =
        get_average_lwe_chunk_size(lwe_dimension, pbs_level, glwe_dimension);
    // Only 64 bits is supported
    scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
                                  glwe_dimension, polynomial_size, pbs_level,
                                  grouping_factor, num_radix_blocks,
                                  cuda_get_max_shared_memory(stream->gpu_index),
                                  allocate_gpu_memory, lwe_chunk_size);
  } else {
    // Classic
    // We only use low latency for classic mode
    if (sizeof(Torus) == sizeof(uint32_t))
      scratch_cuda_bootstrap_low_latency_32(
          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
          allocate_gpu_memory);
    else
      scratch_cuda_bootstrap_low_latency_64(
          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
          allocate_gpu_memory);
  }

  // LUT
  Torus *lut_buffer;
  if (allocate_gpu_memory) {
    // LUT is used as a trivial encryption, so we only allocate memory for the
    // body
    Torus lut_buffer_size =
        2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);

    lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);

    // LUTs
    auto lut_f_message = [message_modulus](Torus x) -> Torus {
      return x % message_modulus;
    };
    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
      return x / message_modulus;
    };

    Torus *lut_buffer_message = lut_buffer;
    Torus *lut_buffer_carry =
        lut_buffer + (glwe_dimension + 1) * polynomial_size;

    generate_device_accumulator<Torus>(
        stream, lut_buffer_message, glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, lut_f_message);

    generate_device_accumulator<Torus>(stream, lut_buffer_carry,
                                       glwe_dimension, polynomial_size,
                                       message_modulus, carry_modulus,
                                       lut_f_carry);
  }

  Torus *lut_indexes;
  if (allocate_gpu_memory) {
    lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);

    Torus h_lut_indexes[2] = {0, 1};
    cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
                             stream);
  }

  Torus *lwe_indexes;
  if (allocate_gpu_memory) {
    Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);

    lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
    for (int i = 0; i < num_radix_blocks; i++)
      h_lwe_indexes[i] = i;
    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                             stream);
    cuda_synchronize_stream(stream);
    free(h_lwe_indexes);
  }

  // Temporary arrays
  Torus *small_lwe_vector;
  Torus *big_lwe_vector;
  if (allocate_gpu_memory) {
    Torus small_vector_size = 2 * (lwe_dimension + 1) * sizeof(Torus);
    Torus big_vector_size =
        2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);

    small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
    big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
  }

  *mem_ptr = new int_fullprop_buffer<Torus>;

  (*mem_ptr)->pbs_type = pbs_type;
  (*mem_ptr)->pbs_buffer = pbs_buffer;

  (*mem_ptr)->lut_buffer = lut_buffer;
  (*mem_ptr)->lut_indexes = lut_indexes;
  (*mem_ptr)->lwe_indexes = lwe_indexes;

  (*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
  (*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
}

// (lwe_dimension+1) threads
// (num_radix_blocks / 2) thread blocks
template <typename Torus>
__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
                                   uint32_t lwe_dimension,
                                   uint32_t num_radix_blocks, uint32_t factor) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if (tid < (lwe_dimension + 1)) {
    for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
      Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
      Torus *msb_block = lsb_block + (lwe_dimension + 1);

      Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);

      packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
    }

    if (num_radix_blocks % 2 != 0) {
      // We couldn't pack the last block, so we just copy it
      Torus *lsb_block =
          lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
      Torus *last_block =
          lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);

      last_block[tid] = lsb_block[tid];
    }
  }
}

// Packs the low ciphertext in the message parts of the high ciphertext
// and moves the high ciphertext into the carry part.
//
// This requires the block parameters to have enough room for two ciphertexts,
// so the carry modulus must be at least as large as the message modulus
// (see the clear-value sketch after this function).
//
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_in, uint32_t lwe_dimension,
                          uint32_t num_radix_blocks, uint32_t factor) {
  assert(lwe_array_out != lwe_array_in);

  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}
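
// Clear-value view of the packing above (exposition only): with
// factor == message_modulus, two blocks (m_lsb, m_msb) become the single
// block m_lsb + message_modulus * m_msb, i.e. the msb lands in the carry
// part. E.g. with message_modulus = 4, (3, 2) packs to 3 + 4 * 2 = 11.
inline uint64_t pack_clear_blocks(uint64_t m_lsb, uint64_t m_msb,
                                  uint64_t factor) {
  return m_lsb + factor * m_msb;
}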

template <typename Torus>
__global__ void
device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
                            int32_t num_blocks, uint32_t lwe_dimension,
                            uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus scalar = scalar_input[tid];
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;

    *body = scalar * delta;
  }
}

template <typename Torus>
__host__ void
create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
                     Torus *scalar_array, uint32_t lwe_dimension,
                     uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
                     uint64_t message_modulus, uint64_t carry_modulus) {

  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
  cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);

  if (num_scalar_blocks == 0)
    return;

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_scalar_blocks;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // (see the encoding sketch below)
  // If message_modulus and carry_modulus are always powers of 2 we can
  // simplify this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
}

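// Encoding sketch (exposition only): with message_modulus = carry_modulus = 4
// the plaintext space takes 4 bits just below the padding bit of a 64-bit
// torus element, so delta = 2^63 / 16 = 2^59 and a clear value m is encoded
// as m * delta, exactly as in the kernel above.
inline uint64_t encode_clear_value(uint64_t m, uint64_t message_modulus,
                                   uint64_t carry_modulus) {
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  return m * delta;
}
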
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
@@ -0,0 +1,107 @@
#include "integer/multiplication.cuh"

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the integer radix multiplication in keyswitch->bootstrap order.
 */
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
    uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size, lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);

  switch (polynomial_size) {
  case 2048:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
        stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks,
        params, allocate_gpu_memory);
    break;
  default:
    break;
  }
}

/*
 * Computes a multiplication between two 64 bit radix lwe ciphertexts
 * encrypting integer values. The keyswitch -> bootstrap pattern is used; the
 * function works for a single pair of radix ciphertexts, and 'v_stream' can
 * be used for parallelization.
 * - 'v_stream' is a void pointer to the Cuda stream to be used in the kernel
 *   launch
 * - 'gpu_index' is the index of the GPU to be used in the kernel launch
 * - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of the
 *   multiplication
 * - 'radix_lwe_left' left radix big lwe ciphertext
 * - 'radix_lwe_right' right radix big lwe ciphertext
 * - 'bsk' bootstrapping key in fourier domain
 * - 'ksk' keyswitching key
 * - 'mem_ptr' scratch memory allocated by the scratch function above
 * - 'message_modulus' message_modulus
 * - 'carry_modulus' carry_modulus
 * - 'glwe_dimension' glwe_dimension
 * - 'lwe_dimension' is the dimension of the small lwe ciphertext
 * - 'polynomial_size' polynomial size
 * - 'pbs_base_log' base log used in the pbs
 * - 'pbs_level' decomposition level count used in the pbs
 * - 'ks_level' decomposition level count used in the keyswitch
 * - 'num_blocks' is the number of big lwe ciphertext blocks inside the radix
 *   ciphertext
 * - 'pbs_type' selects which PBS implementation should be used
 * - 'max_shared_memory' maximum shared memory per cuda block
 */
void cuda_integer_mult_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
    void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 2048:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
        stream, static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
        static_cast<uint64_t *>(radix_lwe_right), bsk,
        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  default:
    break;
  }
}

void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {

  int_mul_memory<uint64_t> *mem_ptr =
      (int_mul_memory<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
}

void cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint64_t scalar,
    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

  cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
      stream, lwe_array, lwe_array, scalar, lwe_dimension,
      lwe_ciphertext_count);
}

void cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
    cuda_stream_t *stream, void *output_lwe_array, void *input_lwe_array,
    uint64_t scalar, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

  host_integer_small_scalar_mult_radix(
      stream, static_cast<uint64_t *>(output_lwe_array),
      static_cast<uint64_t *>(input_lwe_array), scalar, lwe_dimension,
      lwe_ciphertext_count);
}

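// Usage sketch for the entry points above: allocate scratch, multiply, then
// release. All arguments are assumed to be valid device pointers and
// parameters prepared by the caller; this helper is illustrative and not
// part of the API.
static void example_integer_mult_lifecycle(
    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
    void *radix_lwe_right, void *bsk, void *ksk, uint32_t message_modulus,
    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
    uint32_t num_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory) {
  int8_t *mem_ptr = nullptr;
  // Allocate the scratch buffer for this block count / parameter set
  scratch_cuda_integer_mult_radix_ciphertext_kb_64(
      stream, &mem_ptr, message_modulus, carry_modulus, glwe_dimension,
      lwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
      ks_level, grouping_factor, num_blocks, pbs_type, max_shared_memory,
      true);
  // Run the multiplication itself
  cuda_integer_mult_radix_ciphertext_kb_64(
      stream, radix_lwe_out, radix_lwe_left, radix_lwe_right, bsk, ksk,
      mem_ptr, message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
      polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
      grouping_factor, num_blocks, pbs_type, max_shared_memory);
  // Release the scratch buffer
  cleanup_cuda_integer_mult(stream, &mem_ptr);
}
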
@@ -0,0 +1,639 @@
#ifndef CUDA_INTEGER_MULT_CUH
#define CUDA_INTEGER_MULT_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linear_algebra.h"
#include "pbs/bootstrap_amortized.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>

template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
                    Torus *msb_ciphertext, Torus *radix_lwe_right,
                    Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {

  size_t block_id = blockIdx.x;
  double D = sqrt((2 * num_blocks + 1) * (2 * num_blocks + 1) - 8 * block_id);
  size_t radix_id = int((2 * num_blocks + 1 - D) / 2.);
  size_t local_block_id =
      block_id - (2 * num_blocks - radix_id + 1) / 2. * radix_id;
  bool process_msb = (local_block_id < (num_blocks - radix_id - 1));
  auto cur_lsb_block = &lsb_ciphertext[block_id * (params::degree + 1)];
  auto cur_msb_block =
      (process_msb)
          ? &msb_ciphertext[(block_id - radix_id) * (params::degree + 1)]
          : nullptr;

  auto cur_lsb_rhs_block = &lsb_rhs[block_id * (params::degree + 1)];
  auto cur_msb_rhs_block =
      (process_msb) ? &msb_rhs[(block_id - radix_id) * (params::degree + 1)]
                    : nullptr;

  auto cur_ct_right = &radix_lwe_right[radix_id * (params::degree + 1)];
  auto cur_src = &radix_lwe_left[local_block_id * (params::degree + 1)];

  size_t tid = threadIdx.x;

  for (int i = 0; i < params::opt; i++) {
    Torus value = cur_src[tid];
    if (process_msb) {
      cur_lsb_block[tid] = cur_msb_block[tid] = value;
      cur_lsb_rhs_block[tid] = cur_msb_rhs_block[tid] = cur_ct_right[tid];
    } else {
      cur_lsb_block[tid] = value;
      cur_lsb_rhs_block[tid] = cur_ct_right[tid];
    }
    tid += params::degree / params::opt;
  }
  if (threadIdx.x == 0) {
    Torus value = cur_src[params::degree];
    if (process_msb) {
      cur_lsb_block[params::degree] = cur_msb_block[params::degree] = value;
      cur_lsb_rhs_block[params::degree] = cur_msb_rhs_block[params::degree] =
          cur_ct_right[params::degree];
    } else {
      cur_lsb_block[params::degree] = value;
      cur_lsb_rhs_block[params::degree] = cur_ct_right[params::degree];
    }
  }
}

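// Index-math sketch (exposition only): the kernel above flattens the
// triangular layout { shift r contributes (num_blocks - r) lsb blocks } and
// inverts it with the quadratic formula. This loop-based reference computes
// the same (radix_id, local_block_id) pair without floating point, which can
// be handy for checking the sqrt-based inversion.
inline void triangular_index_reference(int block_id, int num_blocks,
                                       int &radix_id, int &local_block_id) {
  int offset = 0;
  radix_id = 0;
  // Walk shift by shift until block_id falls inside the current shift.
  while (block_id >= offset + (num_blocks - radix_id)) {
    offset += num_blocks - radix_id;
    radix_id++;
  }
  local_block_id = block_id - offset;
}
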
template <typename Torus>
void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
                                    Torus *dst, int *S, int *F, int num_blocks,
                                    uint32_t map_size, uint32_t unit_size,
                                    int &total_copied, bool is_message) {
  for (int i = 0; i < map_size; i++) {
    int s_index = i * num_blocks + S[i];
    int number_of_unit = F[i] - S[i] + is_message;
    auto cur_dst = &dst[total_copied * unit_size];
    auto cur_src = &src[s_index * unit_size];
    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
    total_copied += number_of_unit;
  }
}

template <typename Torus>
void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
                                         Torus *dst, int *S, int *F,
                                         uint32_t map_size, uint32_t unit_size,
                                         int &total_copied,
                                         int &total_radix_copied,
                                         int num_blocks, bool is_message) {
  size_t radix_size = unit_size * num_blocks;
  for (int i = 0; i < map_size; i++) {
    auto cur_dst_radix = &dst[total_radix_copied * radix_size];

    int s_index = S[i];
    int number_of_unit = F[i] - s_index + is_message;

    if (!is_message) {
      int zero_block_count = num_blocks - number_of_unit;
      cuda_memset_async(cur_dst_radix, 0,
                        zero_block_count * unit_size * sizeof(Torus), stream);
      s_index = zero_block_count;
    }

    auto cur_dst = &cur_dst_radix[s_index * unit_size];
    auto cur_src = &src[total_copied * unit_size];

    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
    total_copied += number_of_unit;
    ++total_radix_copied;
  }
}

template <typename Torus, class params>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
                                uint32_t chunk_size, uint32_t num_blocks) {

  extern __shared__ Torus result[];
  size_t chunk_id = blockIdx.x;
  size_t chunk_elem_size = chunk_size * num_blocks * (params::degree + 1);
  size_t radix_elem_size = num_blocks * (params::degree + 1);
  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
  size_t block_stride = blockIdx.y * (params::degree + 1);
  auto dst_block = &dst_radix[block_stride];

  // init shared mem with first radix of chunk
  size_t tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    result[tid] = src_chunk[block_stride + tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree] = src_chunk[block_stride + params::degree];
  }

  // accumulate rest of the radixes
  for (int r_id = 1; r_id < chunk_size; r_id++) {
    auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      result[tid] += cur_src_radix[block_stride + tid];
      tid += params::degree / params::opt;
    }
    if (threadIdx.x == 0) {
      result[params::degree] += cur_src_radix[block_stride + params::degree];
    }
  }

  // put result from shared mem to global mem
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    dst_block[tid] = result[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst_block[params::degree] = result[params::degree];
  }
}

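// Sizing sketch (exposition only): a chunk may contain as many blocks as can
// be summed without overflowing the carry space. With 2_2-style parameters
// (message_modulus = carry_modulus = 4), total_modulus = 16 and each block
// carries a message of at most 3, so chunk_size = (16 - 1) / 3 = 5 blocks
// per chunk fed to tree_add_chunks above.
constexpr size_t max_chunk_size(size_t message_modulus, size_t carry_modulus) {
  size_t total_modulus = message_modulus * carry_modulus;
  size_t message_max = message_modulus - 1;
  return (total_modulus - 1) / message_max;
}
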
template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks,
                                        Torus *lsb_blocks, Torus *msb_blocks,
                                        uint32_t glwe_dimension,
                                        uint32_t lsb_count, uint32_t msb_count,
                                        uint32_t num_blocks) {
  size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
  size_t big_lwe_id = blockIdx.x;
  size_t radix_id = big_lwe_id / num_blocks;
  size_t block_id = big_lwe_id % num_blocks;
  size_t lsb_block_id = block_id - radix_id;
  size_t msb_block_id = block_id - radix_id - 1;

  bool process_lsb = (radix_id <= block_id);
  bool process_msb = (radix_id + 1 <= block_id);

  auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_dimension];
  auto cur_res_msb_ct =
      &result_blocks[num_blocks * num_blocks * big_lwe_dimension +
                     big_lwe_id * big_lwe_dimension];
  Torus *cur_lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) *
                                     radix_id / 2 * (params::degree + 1)];
  Torus *cur_msb_radix =
      (process_msb)
          ? &msb_blocks[(2 * num_blocks - radix_id - 1) * radix_id / 2 *
                        (params::degree + 1)]
          : nullptr;
  Torus *cur_lsb_ct = (process_lsb)
                          ? &cur_lsb_radix[lsb_block_id * (params::degree + 1)]
                          : nullptr;
  Torus *cur_msb_ct = (process_msb)
                          ? &cur_msb_radix[msb_block_id * (params::degree + 1)]
                          : nullptr;
  size_t tid = threadIdx.x;

  for (int i = 0; i < params::opt; i++) {
    cur_res_lsb_ct[tid] = (process_lsb) ? cur_lsb_ct[tid] : 0;
    cur_res_msb_ct[tid] = (process_msb) ? cur_msb_ct[tid] : 0;
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    cur_res_lsb_ct[params::degree] =
        (process_lsb) ? cur_lsb_ct[params::degree] : 0;
    cur_res_msb_ct[params::degree] =
        (process_msb) ? cur_msb_ct[params::degree] : 0;
  }
}

template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
    cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto message_modulus = mem_ptr->params.message_modulus;
  auto carry_modulus = mem_ptr->params.carry_modulus;

  int big_lwe_dimension = glwe_dimension * polynomial_size;
  int big_lwe_size = big_lwe_dimension + 1;

  // 'vector_result_lsb' contains blocks from all possible right shifts of
  // radix_lwe_left, only nonzero blocks are kept
  int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;

  // 'vector_result_msb' contains blocks from all possible shifts of
  // radix_lwe_left except the last blocks of each shift. Only nonzero blocks
  // are kept
  int msb_vector_block_count = num_blocks * (num_blocks - 1) / 2;

  // total number of blocks msb and lsb
  int total_block_count = lsb_vector_block_count + msb_vector_block_count;

  // buffer to keep all lsb and msb shifts
  // for lsb all nonzero blocks of each right shift are kept
  // for 0 shift num_blocks blocks
  // for 1 shift num_blocks - 1 blocks
  // for num_blocks - 1 shift 1 block
  // (num_blocks + 1) * num_blocks / 2 blocks
  // for msb we don't keep track of the last blocks so
  // for 0 shift num_blocks - 1 blocks
  // for 1 shift num_blocks - 2 blocks
  // for num_blocks - 1 shift 0 blocks
  // (num_blocks - 1) * num_blocks / 2 blocks
  // in total num_blocks^2 blocks
  // (see the block-count sketch after this function)
  // in each block there is a big polynomial with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto vector_result_sb = mem_ptr->vector_result_sb;

  // buffer to keep lsb_vector + msb_vector
  // addition will happen in full terms so there will be
  // num_blocks terms and each term will have num_blocks blocks
  // num_blocks^2 blocks in total
  // and each block has a big lwe ciphertext with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto block_mul_res = mem_ptr->block_mul_res;

  // buffer to keep the keyswitch result of num_blocks^2 ciphertexts
  // in total it has num_blocks^2 small lwe ciphertexts with
  // lwe_dimension + 1 coefficients
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

  // buffer to keep the pbs result for num_blocks^2 lwe ciphertexts
  // in total it has num_blocks^2 big lwe ciphertexts with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;

  // it contains two test vectors, first for lsb extraction,
  // second for msb extraction, with total length =
  // 2 * (glwe_dimension + 1) * polynomial_size
  auto test_vector_array = mem_ptr->test_vector_array;

  // accumulator to extract message
  // with length (glwe_dimension + 1) * polynomial_size
  auto test_vector_message = mem_ptr->test_vector_message;

  // accumulator to extract carry
  // with length (glwe_dimension + 1) * polynomial_size
  auto test_vector_carry = mem_ptr->test_vector_carry;

  // to be used as default indexing
  auto lwe_indexes = test_vector_array->lwe_indexes;

  auto vector_result_lsb = &vector_result_sb[0];
  auto vector_result_msb =
      &vector_result_sb[lsb_vector_block_count *
                        (polynomial_size * glwe_dimension + 1)];

  auto vector_lsb_rhs = &block_mul_res[0];
  auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count *
                                       (polynomial_size * glwe_dimension + 1)];

  dim3 grid(lsb_vector_block_count, 1, 1);
  dim3 thds(params::degree / params::opt, 1, 1);

  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
      total_block_count, test_vector_array);

  vector_result_lsb = &block_mul_res[0];
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         stream->stream>>>(vector_result_sb, vector_result_lsb,
                           vector_result_msb, glwe_dimension,
                           lsb_vector_block_count, msb_vector_block_count,
                           num_blocks);

  auto new_blocks = block_mul_res;
  auto old_blocks = vector_result_sb;

  // amount of current radixes after block_mul
  size_t r = 2 * num_blocks;

  size_t total_modulus = message_modulus * carry_modulus;
  size_t message_max = message_modulus - 1;
  size_t chunk_size = (total_modulus - 1) / message_max;
  size_t ch_amount = r / chunk_size;

  int terms_degree[r * num_blocks];
  int f_b[ch_amount];
  int l_b[ch_amount];

  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
    terms_degree[i] = (b_id >= r_id) ? 3 : 0;
  }
  auto terms_degree_msb = &terms_degree[num_blocks * num_blocks];
  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
    terms_degree_msb[i] = (b_id > r_id) ? 2 : 0;
  }

  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
  while (r > chunk_size) {
    int cur_total_blocks = r * num_blocks;
    ch_amount = r / chunk_size;
    dim3 add_grid(ch_amount, num_blocks, 1);
    size_t sm_size = big_lwe_size * sizeof(Torus);
    cuda_memset_async(new_blocks, 0,
                      ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
                      stream);

    tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
        new_blocks, old_blocks, chunk_size, num_blocks);

    for (int c_id = 0; c_id < ch_amount; c_id++) {
      auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
      int mx = 0;
      int mn = num_blocks;
      for (int r_id = 1; r_id < chunk_size; r_id++) {
        auto cur_radix = &cur_chunk[r_id * num_blocks];
        for (int i = 0; i < num_blocks; i++) {
          if (cur_radix[i]) {
            mn = min(mn, i);
            mx = max(mx, i);
          }
        }
      }
      f_b[c_id] = mn;
      l_b[c_id] = mx;
    }

    int total_copied = 0;
    int message_count = 0;
    int carry_count = 0;
    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
                                          l_b, num_blocks, ch_amount,
                                          big_lwe_size, total_copied, true);

    message_count = total_copied;
    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
                                          l_b, num_blocks, ch_amount,
                                          big_lwe_size, total_copied, false);
    carry_count = total_copied - message_count;

    auto message_blocks_vector = old_blocks;
    auto carry_blocks_vector =
        &old_blocks[message_count * (glwe_dimension * polynomial_size + 1)];

    cuda_keyswitch_lwe_ciphertext_vector(
        stream, small_lwe_vector, lwe_indexes, old_blocks, lwe_indexes, ksk,
        polynomial_size * glwe_dimension, lwe_dimension,
        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);

    execute_pbs<Torus>(
        stream, message_blocks_vector, lwe_indexes, test_vector_message->lut,
        test_vector_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
        test_vector_message->pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, mem_ptr->params.pbs_base_log,
        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
        message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);

    execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
                       test_vector_carry->lut, test_vector_carry->lut_indexes,
                       &small_lwe_vector[message_count * (lwe_dimension + 1)],
                       lwe_indexes, bsk, test_vector_carry->pbs_buffer,
                       glwe_dimension, lwe_dimension, polynomial_size,
                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
                       mem_ptr->params.grouping_factor, carry_count, 1, 0,
                       max_shared_memory, mem_ptr->params.pbs_type);

    int rem_blocks = r % chunk_size * num_blocks;
    int new_blocks_created = 2 * ch_amount * num_blocks;
    int copy_size = rem_blocks * big_lwe_size * sizeof(Torus);

    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);

    total_copied = 0;
    int total_radix_copied = 0;
    extract_message_carry_to_full_radix<Torus>(
        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
        total_copied, total_radix_copied, num_blocks, true);
    extract_message_carry_to_full_radix<Torus>(
        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
        total_copied, total_radix_copied, num_blocks, false);

    std::swap(new_blocks, old_blocks);
    r = (new_blocks_created + rem_blocks) / num_blocks;
  }

  dim3 add_grid(1, num_blocks, 1);
  size_t sm_size = big_lwe_size * sizeof(Torus);
  cuda_memset_async(radix_lwe_out, 0,
                    num_blocks * big_lwe_size * sizeof(Torus), stream);
  tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
      radix_lwe_out, old_blocks, r, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
      test_vector_message);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk,
      num_blocks, test_vector_carry);

  cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);

  host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
                big_lwe_size, num_blocks);

  host_propagate_single_carry_low_latency<Torus>(
      stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
}

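// Block-count sketch (exposition only): for num_blocks = 4 the shifted lsb
// terms keep 4 + 3 + 2 + 1 = 10 blocks, the msb terms keep 3 + 2 + 1 + 0 = 6,
// and together they fill exactly num_blocks^2 = 16 big-LWE blocks, matching
// the counts used in host_integer_mult_radix_kb above.
constexpr int lsb_block_count(int num_blocks) {
  return num_blocks * (num_blocks + 1) / 2;
}
constexpr int msb_block_count(int num_blocks) {
  return num_blocks * (num_blocks - 1) / 2;
}
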
template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
  *mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
                                       allocate_gpu_memory);
}

// Function to apply a lookup table.
// It has two modes:
// lsb_msb_mode == true - extracts lsb and msb
// lsb_msb_mode == false - extracts message and carry
template <typename Torus, typename STorus, class params>
void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
                        int_mul_memory<Torus> *mem_ptr,
                        uint32_t glwe_dimension, uint32_t lwe_dimension,
                        uint32_t polynomial_size, uint32_t pbs_base_log,
                        uint32_t pbs_level, uint32_t ks_base_log,
                        uint32_t ks_level, uint32_t grouping_factor,
                        uint32_t lsb_message_blocks_count,
                        uint32_t msb_carry_blocks_count,
                        uint32_t max_shared_memory, bool lsb_msb_mode) {

  int total_blocks_count = lsb_message_blocks_count + msb_carry_blocks_count;
  int gpu_n = mem_ptr->p2p_gpu_count;
  if (total_blocks_count < gpu_n)
    gpu_n = total_blocks_count;
  int gpu_blocks_count = total_blocks_count / gpu_n;
  int big_lwe_size = glwe_dimension * polynomial_size + 1;
  // int small_lwe_size = lwe_dimension + 1;

#pragma omp parallel for num_threads(gpu_n)
  for (int i = 0; i < gpu_n; i++) {
    cudaSetDevice(i);
    auto this_stream = mem_ptr->streams[i];
    // Index where input and output blocks start for the current gpu
    int big_lwe_start_index = i * gpu_blocks_count * big_lwe_size;

    // Last gpu might have extra blocks to process if the total block number
    // is not divisible by gpu_n
    if (i == gpu_n - 1) {
      gpu_blocks_count += total_blocks_count % gpu_n;
    }

    int can_access_peer;
    cudaDeviceCanAccessPeer(&can_access_peer, i, 0);
    if (i == 0) {
      check_cuda_error(
          cudaMemcpyAsync(mem_ptr->pbs_output_multi_gpu[i],
                          &input_ciphertexts[big_lwe_start_index],
                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
                          cudaMemcpyDeviceToDevice, *this_stream));
    } else if (can_access_peer) {
      check_cuda_error(cudaMemcpyPeerAsync(
          mem_ptr->pbs_output_multi_gpu[i], i,
          &input_ciphertexts[big_lwe_start_index], 0,
          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
    } else {
      // Uses host memory as middle ground
      cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
                               &input_ciphertexts[big_lwe_start_index],
                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
                               this_stream, i);
      cuda_memcpy_async_to_gpu(
          mem_ptr->pbs_output_multi_gpu[i],
          mem_ptr->device_to_device_buffer[i],
          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
    }

    // when lsb and msb have to be extracted
    //   for the first lsb_count blocks we need lsb_acc
    //   for the last msb_count blocks we need msb_acc
    // when message and carry have to be extracted
    //   for the first message_count blocks we need message_acc
    //   for the last carry_count blocks we need carry_acc
    Torus *cur_tvi;
    if (lsb_msb_mode) {
      cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
                    ? mem_ptr->tvi_lsb_multi_gpu[i]
                    : mem_ptr->tvi_msb_multi_gpu[i];

    } else {
      cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
                    ? mem_ptr->tvi_message_multi_gpu[i]
                    : mem_ptr->tvi_carry_multi_gpu[i];
    }

    // execute keyswitch on the current gpu with corresponding input and
    // output blocks: pbs_output_multi_gpu[i] is the input for keyswitch and
    // pbs_input_multi_gpu[i] is the output for keyswitch
    cuda_keyswitch_lwe_ciphertext_vector(
        this_stream, i, mem_ptr->pbs_input_multi_gpu[i],
        mem_ptr->pbs_output_multi_gpu[i], mem_ptr->ksk_multi_gpu[i],
        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log,
        ks_level, gpu_blocks_count);

    // execute pbs on the current gpu with corresponding input and output
    cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
        this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
        mem_ptr->test_vector_multi_gpu[i], cur_tvi,
        mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
        mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
        polynomial_size, grouping_factor, pbs_base_log, pbs_level,
        grouping_factor, gpu_blocks_count, 2, 0, max_shared_memory);

    // the lookup table is applied and now data from the current gpu has to be
    // copied back to gpu_0 into the 'output_ciphertexts' buffer
    if (i == 0) {
      check_cuda_error(
          cudaMemcpyAsync(&output_ciphertexts[big_lwe_start_index],
                          mem_ptr->pbs_output_multi_gpu[i],
                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
                          cudaMemcpyDeviceToDevice, *this_stream));
    } else if (can_access_peer) {
      check_cuda_error(cudaMemcpyPeerAsync(
          &output_ciphertexts[big_lwe_start_index], 0,
          mem_ptr->pbs_output_multi_gpu[i], i,
          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
    } else {
      // Uses host memory as middle ground
      cuda_memcpy_async_to_cpu(
          mem_ptr->device_to_device_buffer[i],
          mem_ptr->pbs_output_multi_gpu[i],
          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
      cuda_memcpy_async_to_gpu(&output_ciphertexts[big_lwe_start_index],
                               mem_ptr->device_to_device_buffer[i],
                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
                               this_stream, i);
    }
  }
}

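// Partitioning sketch (exposition only): blocks are split evenly across the
// p2p-visible GPUs, with the remainder going to the last one, as in the loop
// above. E.g. 10 blocks on 4 GPUs gives counts {2, 2, 2, 4}.
inline int blocks_for_gpu(int gpu_id, int gpu_n, int total_blocks) {
  int base = total_blocks / gpu_n;
  return (gpu_id == gpu_n - 1) ? base + total_blocks % gpu_n : base;
}
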
template <typename T>
__global__ void device_small_scalar_radix_multiplication(
    T *output_lwe_array, T *input_lwe_array, T scalar, uint32_t lwe_dimension,
    uint32_t num_blocks) {

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int lwe_size = lwe_dimension + 1;
  if (index < num_blocks * lwe_size) {
    // Here we take advantage of the wrapping behaviour of uint
    output_lwe_array[index] = input_lwe_array[index] * scalar;
  }
}

template <typename T>
__host__ void host_integer_small_scalar_mult_radix(
    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
      input_lwe_ciphertext_count);
  check_cuda_error(cudaGetLastError());
}

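// Wrapping-arithmetic sketch (exposition only): multiplying every coefficient
// of an LWE ciphertext by a small clear scalar relies on unsigned overflow
// reducing mod 2^64, which matches the torus arithmetic. E.g.
// (2^63 + 5) * 2 wraps to 10.
inline uint64_t torus_scalar_mul(uint64_t coefficient, uint64_t scalar) {
  return coefficient * scalar; // wraps mod 2^64 by definition for uint64_t
}
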
#endif
@@ -0,0 +1,12 @@
#include "integer/negation.cuh"

void cuda_negate_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {

  host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
                              static_cast<uint64_t *>(lwe_array),
                              lwe_dimension, lwe_ciphertext_count,
                              message_modulus, carry_modulus);
}
@@ -0,0 +1,79 @@
#ifndef CUDA_INTEGER_NEGATE_CUH
#define CUDA_INTEGER_NEGATE_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "device.h"
#include "integer.h"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
                              uint64_t lwe_dimension, uint64_t message_modulus,
                              uint64_t carry_modulus, uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < lwe_dimension + 1) {
    bool is_body = (tid == lwe_dimension);

    // z = ceil( degree / 2^p ) * 2^p
    // (see the clear-value sketch after the host function below)
    uint64_t z = (2 * message_modulus - 1) / message_modulus;
    __syncthreads();
    z *= message_modulus;

    // (0,Delta*z) - ct
    output[tid] = (is_body ? z * delta - input[tid] : -input[tid]);

    for (int radix_block_id = 1; radix_block_id < num_blocks;
         radix_block_id++) {
      tid += (lwe_dimension + 1);

      // Subtract z/B from the next ciphertext to compensate for the
      // addition of z
      uint64_t zb = z / message_modulus;

      uint64_t encoded_zb = zb * delta;

      __syncthreads();

      // (0,Delta*z) - ct
      output[tid] =
          (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
      __syncthreads();
    }
  }
}

template <typename Torus>
__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
                                          Torus *input, uint32_t lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint64_t message_modulus,
                                          uint64_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);
  uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can
  // simplify this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
      output, input, input_lwe_ciphertext_count, lwe_dimension,
      message_modulus, carry_modulus, delta);
  check_cuda_error(cudaGetLastError());
}

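// Clear-value sketch of the negation trick above (exposition only): each
// block is replaced by z - m, where z is the block degree rounded up to a
// multiple of the message modulus (z = message_modulus for fresh blocks), so
// the result stays non-negative; z / message_modulus is then subtracted from
// the next block to cancel the extra carry. With message_modulus = 4 and
// blocks (m0, m1) encoding m0 + 4*m1, this yields (4 - m0, 3 - m1), which is
// congruent to -(m0 + 4*m1) mod 16 before carry propagation.
inline void negate_clear_reference(uint64_t *blocks, int num_blocks,
                                   uint64_t message_modulus) {
  uint64_t z = message_modulus; // degree <= message_modulus - 1, rounded up
  uint64_t compensation = 0;
  for (int i = 0; i < num_blocks; i++) {
    blocks[i] = z - blocks[i] - compensation;
    compensation = z / message_modulus; // subtracted from the next block
  }
}
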
#endif
@@ -0,0 +1,12 @@
#include "integer/scalar_addition.cuh"

void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, void *scalar_input,
    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {

  host_integer_radix_scalar_addition_inplace(
      stream, static_cast<uint64_t *>(lwe_array),
      static_cast<uint64_t *>(scalar_input), lwe_dimension,
      lwe_ciphertext_count, message_modulus, carry_modulus);
}
@@ -0,0 +1,130 @@
#ifndef CUDA_INTEGER_ADD_CUH
#define CUDA_INTEGER_ADD_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "device.h"
#include "integer.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>

template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
    uint32_t lwe_dimension, uint64_t delta) {

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus scalar = scalar_input[tid];
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;

    *body += scalar * delta;
  }
}

template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can
  // simplify this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
                                                 stream->stream>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__global__ void device_integer_radix_add_scalar_one_inplace(
    Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
    uint64_t delta) {

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
    *body += delta;
  }
}

template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can
  // simplify this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
                                                stream->stream>>>(
      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__global__ void device_integer_radix_scalar_subtraction_inplace(
    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
    uint32_t lwe_dimension, uint64_t delta) {

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus scalar = scalar_input[tid];
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;

    *body -= scalar * delta;
  }
}

template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can
  // simplify this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
                                                    stream->stream>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
}
#endif
@@ -0,0 +1,14 @@
#include "integer/scalar_bitops.cuh"

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
    void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

  host_integer_radix_scalar_bitop_kb<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_input),
      static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      lwe_ciphertext_count, op);
}
@@ -0,0 +1,51 @@
#ifndef CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH
#define CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH

#include "integer/bitwise_ops.cuh"
#include <omp.h>

template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
    Torus *clear_blocks, uint32_t num_clear_blocks,
    int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks, BITOP_TYPE op) {

  auto lut = mem_ptr->lut;
  auto params = lut->params;
  auto big_lwe_dimension = params.big_lwe_dimension;

  uint32_t lwe_size = big_lwe_dimension + 1;

  if (num_clear_blocks == 0) {
    if (op == SCALAR_BITAND) {
      // ANDing with an all-zero clear value zeroes the whole ciphertext
      cuda_memset_async(lwe_array_out, 0,
                        num_radix_blocks * lwe_size * sizeof(Torus), stream);
    } else {
      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
                                   num_radix_blocks * lwe_size * sizeof(Torus),
                                   stream);
    }
  } else {
    auto lut_buffer = lut->lut;
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one (see the indexing sketch below)
    cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
                                 num_clear_blocks * sizeof(Torus), stream);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
        lut);

    if (op == SCALAR_BITAND) {
      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
      cuda_memset_async(lwe_array_out_block, 0,
                        (num_radix_blocks - num_clear_blocks) * lwe_size *
                            sizeof(Torus),
                        stream);
    }
  }
}

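// LUT-indexing sketch (exposition only): for scalar bit-ops one LUT per
// possible clear block value is precomputed, and the decomposed scalar is
// copied into lut_indexes so that block i is evaluated against the LUT
// matching its clear operand. For SCALAR_BITAND with a clear block value c,
// that LUT computes
inline uint64_t scalar_bitand_lut(uint64_t x, uint64_t clear_block) {
  return x & clear_block;
}
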
#endif
@@ -0,0 +1,44 @@
#include "integer/scalar_comparison.cuh"

void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
  switch (buffer->op) {
  // case EQ:
  // case NE:
  //   host_integer_radix_equality_check_kb<uint64_t>(
  //       stream, static_cast<uint64_t *>(lwe_array_out),
  //       static_cast<uint64_t *>(lwe_array_1),
  //       static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
  //       static_cast<uint64_t *>(ksk), glwe_dimension, polynomial_size,
  //       big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
  //       pbs_level, pbs_base_log, grouping_factor, lwe_ciphertext_count,
  //       message_modulus, carry_modulus);
  //   break;
  case GT:
  case GE:
  case LT:
  case LE:
    host_integer_radix_scalar_difference_check_kb<uint64_t>(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(scalar_blocks), buffer,
        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
        lwe_ciphertext_count, num_scalar_blocks);
    break;
  case MAX:
  case MIN:
    host_integer_radix_scalar_maxmin_kb<uint64_t>(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
        static_cast<uint64_t *>(ksk), lwe_ciphertext_count,
        num_scalar_blocks);
    break;
  default:
    printf("Not implemented\n");
  }
}
@@ -0,0 +1,298 @@
|
||||
#ifndef CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
|
||||
#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
|
||||
|
||||
#include "integer/comparison.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
|
||||
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
auto diff_buffer = mem_ptr->diff_buffer;
|
||||
|
||||
size_t big_lwe_size = big_lwe_dimension + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
  // Reducing the signs is the bottleneck of the comparison algorithms.
  // In the scalar case, however, there is an improvement:
  //
  // The idea is to reduce the number of sign blocks we have to
  // reduce. We can do that by splitting the comparison problem in two parts.
  //
  // - One part where we compute the sign blocks between the scalar and just
  //   enough blocks from the ciphertext to represent the scalar value
  //
  // - The other part compares the ciphertext blocks not considered for the
  //   sign computation with zero, and creates a single sign block from that.
  //
  // The smaller the scalar value is compared to the number of encrypted bits
  // in the ciphertext, the more comparisons with zero we have to do, and the
  // fewer sign blocks we will have to reduce.
  //
  // This creates a speedup, as comparing a batch of blocks with 0 is faster.
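  //
  // Worked example (illustration only, not from the original source): with an
  // 8-block radix ciphertext and a scalar that fits in 3 blocks,
  //
  //   uint32_t num_lsb = total_num_scalar_blocks;           // 3, full compare
  //   uint32_t num_msb = total_num_radix_blocks - num_lsb;  // 5, compare to 0
  //
  // so the 5 most significant blocks only go through the cheap zero check and
  // contribute a single extra sign block to the final reduction.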
  if (total_num_scalar_blocks == 0) {
    // The scalar is zero, so we only have to compare the ciphertext blocks
    // with zero
    host_compare_with_zero_equality(stream, mem_ptr->tmp_lwe_array_out,
                                    lwe_array_in, mem_ptr, bsk, ksk,
                                    total_num_radix_blocks);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);

      return sign_handler_f(x);
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, scalar_last_leaf_lut_f);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);

    // The result will be in the first block. Everything else is garbage.
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      big_lwe_size_bytes * (total_num_radix_blocks - 1),
                      stream);

  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both parts of the work described above

    uint32_t num_lsb_radix_blocks = total_num_scalar_blocks;
    uint32_t num_msb_radix_blocks =
        total_num_radix_blocks - num_lsb_radix_blocks;

    auto lsb = lwe_array_in;
    auto msb = lwe_array_in + num_lsb_radix_blocks * big_lwe_size;

    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

    cuda_synchronize_stream(stream);
    auto lsb_stream = diff_buffer->lsb_stream;
    auto msb_stream = diff_buffer->msb_stream;

#pragma omp parallel sections
    {
      // Both sections may be executed in parallel
#pragma omp section
      {
        //////////////
        // lsb
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
                    num_lsb_radix_blocks, message_modulus);
        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
                    message_modulus);

        // From this point on we have half the number of blocks
        num_lsb_radix_blocks /= 2;
        num_lsb_radix_blocks += (total_num_scalar_blocks % 2);

        // comparisons will be assigned
        // - 0 if lhs < rhs
        // - 1 if lhs == rhs
        // - 2 if lhs > rhs

        auto comparisons = mem_ptr->tmp_block_comparisons;
        scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);

        // Reduces a vec containing radix blocks that encrypt a sign
        // (inferior, equal, superior) to one single radix block containing the
        // final sign
        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
                            mem_ptr->diff_buffer->tree_buffer,
                            mem_ptr->cleaning_lut_f, bsk, ksk,
                            num_lsb_radix_blocks);
      }
#pragma omp section
      {
        //////////////
        // msb
        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
                                        mem_ptr, bsk, ksk,
                                        num_msb_radix_blocks);
      }
    }
    cuda_synchronize_stream(lsb_stream);
    cuda_synchronize_stream(msb_stream);

    //////////////
    // Reduce the two blocks into one final block

    auto scalar_bivariate_last_leaf_lut_f =
        [sign_handler_f](Torus lsb, Torus msb) -> Torus {
      if (msb == 1)
        return sign_handler_f(lsb);
      else
        return sign_handler_f(IS_SUPERIOR);
    };

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
        carry_modulus, scalar_bivariate_last_leaf_lut_f);

    integer_radix_apply_bivariate_lookup_table_kb(
        stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
        1, lut);

    // The result will be in the first block. Everything else is garbage.
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
                      stream);
  } else {
    // We only have to do the regular comparison,
    // and not the part where we compare the most significant blocks with zeros
    // total_num_radix_blocks == total_num_scalar_blocks
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
    uint32_t num_scalar_blocks = total_num_scalar_blocks;

    auto lsb = lwe_array_in;

    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

    pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
                num_lsb_radix_blocks, message_modulus);
    pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
                message_modulus);

    // From this point on we have half the number of blocks
    num_lsb_radix_blocks /= 2;
    num_scalar_blocks /= 2;

    // comparisons will be assigned
    // - 0 if lhs < rhs
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
    auto comparisons = mem_ptr->tmp_lwe_array_out;
    scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
                                   ksk, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypt a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
    tree_sign_reduction(stream, lwe_array_out, comparisons,
                        mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
                        ksk, num_lsb_radix_blocks);

    // The result will be in the first block. Everything else is garbage.
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
                      stream);
  }
}
template <typename Torus>
__host__ void
scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                               Torus *lwe_array_in, Torus *scalar_blocks,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto small_lwe_dimension = params.small_lwe_dimension;
  auto ks_level = params.ks_level;
  auto ks_base_log = params.ks_base_log;
  auto pbs_level = params.pbs_level;
  auto pbs_base_log = params.pbs_base_log;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // When rhs > lhs, the subtraction overflows and the padding bit is set to 1,
  // meaning that the output of the PBS will be the negative (modulo the
  // message space).
  //
  // Example:
  // lhs: 1, rhs: 3, message modulus: 4, carry modulus: 4
  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
  // Since there was an overflow, the padding bit is 1 and not 0.
  // When applying the LUT to an input value of 14 we would expect 1,
  // but since the padding bit is 1, we get -1 modulo our message
  // space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000.
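  //
  // A standalone sanity check of the wrapping arithmetic above (illustration
  // only, not part of the backend): with unsigned 64-bit integers, (1 - 3)
  // wraps to 2^64 - 2, and since 2^64 is a multiple of 16,
  // (2^64 - 2) % 16 == 14, matching the 1|1110 encoding in the example.
  //
  //   uint64_t msg_space = 4 * 4;              // message * carry modulus
  //   uint64_t lhs = 1, rhs = 3;
  //   uint64_t diff = (lhs - rhs) % msg_space; // == 14
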
  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
  cuda_memcpy_async_gpu_to_gpu(
      subtracted_blocks, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
  // Subtract
  // Here we need the true LWE subtraction, not the one that comes from
  // shortint.
  host_integer_radix_scalar_subtraction_inplace(
      stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);

  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
                                                 subtracted_blocks, bsk, ksk,
                                                 num_radix_blocks, sign_lut);

  // Add one
  // Here lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1. So the output values after the addition will be: 0, 1, 2.
  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
                                            big_lwe_dimension, num_radix_blocks,
                                            message_modulus, carry_modulus);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t total_num_radix_blocks,
    uint32_t total_num_scalar_blocks) {

  auto params = mem_ptr->params;

  // Calculates the difference sign between the ciphertext and the scalar
  // - 0 if lhs < rhs
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
  host_integer_radix_scalar_difference_check_kb(
      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
      mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert the scalar to a
  // trivial ciphertext
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

  create_trivial_radix(stream, lwe_array_right, scalar_blocks,
                       params.big_lwe_dimension, total_num_radix_blocks,
                       total_num_scalar_blocks, params.message_modulus,
                       params.carry_modulus);

  // Selector
  // CMUX for Max or Min
  host_integer_radix_cmux_kb(
      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
#endif
@@ -0,0 +1,40 @@
#include "scalar_rotate.cuh"

void scratch_cuda_integer_radix_scalar_rotate_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, allocate_gpu_memory);
}

void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
                                                    void *lwe_array, uint32_t n,
                                                    int8_t *mem_ptr, void *bsk,
                                                    void *ksk,
                                                    uint32_t num_blocks) {

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array), n,
      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      num_blocks);
}

void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
                                              int8_t **mem_ptr_void) {

  int_shift_buffer<uint64_t> *mem_ptr =
      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
}
@@ -0,0 +1,114 @@
#ifndef CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
#define CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
    cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
                                         num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
    int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {

  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;

  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  size_t num_bits_in_message = (size_t)log2(message_modulus);
  size_t total_num_bits = num_bits_in_message * num_blocks;
  n = n % total_num_bits;

  if (n == 0) {
    return;
  }
  size_t rotations = n / num_bits_in_message;
  size_t shift_within_block = n % num_bits_in_message;
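  // Worked example (illustration only, not from the original source): with
  // message_modulus = 4 there are 2 bits per block. Rotating a 4-block
  // ciphertext by n = 5 bits gives rotations = 5 / 2 = 2 whole-block
  // rotations, plus shift_within_block = 5 % 2 = 1 remaining bit handled by
  // the bivariate LUT below.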

  Torus *rotated_buffer = mem->tmp_rotated;

  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

  // Rotate all the blocks in the radix ciphertext and copy the result into a
  // new buffer. 256 threads are used in every block, block_count blocks are
  // used in the grid, and one block is responsible for processing a single
  // LWE ciphertext.
  if (mem->shift_type == LEFT_SHIFT) {
    radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, stream);

    if (shift_within_block == 0) {
      return;
    }

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
    radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
        giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
        lut_bivariate);

  } else {
    // right rotation
    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, stream);

    if (shift_within_block == 0) {
      return;
    }

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
        giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
        lut_bivariate);
  }
}

#endif // CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
@@ -0,0 +1,38 @@
#include "scalar_shifts.cuh"

void scratch_cuda_integer_radix_scalar_shift_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_shift_kb<uint64_t>(
      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, allocate_gpu_memory);
}

void cuda_integer_radix_scalar_shift_kb_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
    void *bsk, void *ksk, uint32_t num_blocks) {

  host_integer_radix_scalar_shift_kb_inplace<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array), shift,
      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      num_blocks);
}

void cleanup_cuda_integer_radix_scalar_shift(cuda_stream_t *stream,
                                             int8_t **mem_ptr_void) {

  int_shift_buffer<uint64_t> *mem_ptr =
      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
}
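
/*
 * Hedged usage sketch (illustration only, not part of this file): the entry
 * points above follow the backend's scratch/compute/cleanup lifecycle. The
 * variables below (stream, keys, parameter values, device buffer) are assumed
 * to be prepared by the caller.
 *
 *   int8_t *mem_ptr = nullptr;
 *   scratch_cuda_integer_radix_scalar_shift_kb_64(
 *       stream, &mem_ptr, glwe_dimension, polynomial_size, big_lwe_dimension,
 *       small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
 *       grouping_factor, num_blocks, message_modulus, carry_modulus,
 *       pbs_type, LEFT_SHIFT, true);
 *   cuda_integer_radix_scalar_shift_kb_64_inplace(stream, d_lwe_array, shift,
 *                                                 mem_ptr, bsk, ksk,
 *                                                 num_blocks);
 *   cleanup_cuda_integer_radix_scalar_shift(stream, &mem_ptr);
 */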
@@ -0,0 +1,125 @@
#ifndef CUDA_INTEGER_SHIFT_OPS_CUH
#define CUDA_INTEGER_SHIFT_OPS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_shift_kb(
    cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
                                         num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void host_integer_radix_scalar_shift_kb_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
    int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {

  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;

  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  size_t num_bits_in_block = (size_t)log2(message_modulus);
  size_t total_num_bits = num_bits_in_block * num_blocks;
  shift = shift % total_num_bits;

  if (shift == 0) {
    return;
  }
  size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
  size_t shift_within_block = shift % num_bits_in_block;
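  // Illustration (not from the original source): blocks are stored from LSB
  // to MSB, so a logical left shift by whole blocks is implemented as a
  // rotate-right of the block array followed by zeroing the vacated low
  // blocks. With 2 bits per block, shift = 5 over 4 blocks gives
  // rotations = 2 and shift_within_block = 1.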

  Torus *rotated_buffer = mem->tmp_rotated;

  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
  auto lut_univariate = mem->lut_buffers_univariate[shift_within_block];

  // Rotate all the blocks in the radix ciphertext and copy the result into a
  // new buffer. 256 threads are used in every block, block_count blocks are
  // used in the grid, and one block is responsible for processing a single
  // LWE ciphertext.
  if (mem->shift_type == LEFT_SHIFT) {
    radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
                      stream);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, stream);

    if (shift_within_block == 0 || rotations == num_blocks) {
      return;
    }

    // check if we have enough blocks for partial processing
    if (rotations < num_blocks - 1) {
      auto partial_current_blocks = &lwe_array[(rotations + 1) * big_lwe_size];
      auto partial_previous_blocks = &lwe_array[rotations * big_lwe_size];

      size_t partial_block_count = num_blocks - rotations - 1;

      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          stream, partial_current_blocks, partial_current_blocks,
          partial_previous_blocks, bsk, ksk, partial_block_count,
          lut_bivariate);
    }

    auto rest = &lwe_array[rotations * big_lwe_size];

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, rest, rest, bsk, ksk, 1, lut_univariate);

  } else {
    // right shift
    // rotate left, as the blocks are stored from LSB to MSB
    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size,
                      0, rotations * big_lwe_size_bytes, stream);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, stream);

    if (shift_within_block == 0 || rotations == num_blocks) {
      return;
    }

    // check if we have enough blocks for partial processing
    if (rotations < num_blocks - 1) {
      auto partial_current_blocks = lwe_array;
      auto partial_next_blocks = &lwe_array[big_lwe_size];

      size_t partial_block_count = num_blocks - rotations - 1;

      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          stream, partial_current_blocks, partial_current_blocks,
          partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
    }

    // The right-most block is done separately, as it does not
    // need to recover the shifted bits from its right neighbour.
    auto last_block = &lwe_array[(num_blocks - rotations - 1) * big_lwe_size];
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, last_block, last_block, bsk, ksk, 1, lut_univariate);
  }
}

#endif // CUDA_INTEGER_SHIFT_OPS_CUH
@@ -0,0 +1,109 @@
#include "linearalgebra/addition.cuh"

/*
 * Perform the addition of two u32 input LWE ciphertext vectors.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

  host_addition(stream, static_cast<uint32_t *>(lwe_array_out),
                static_cast<uint32_t *>(lwe_array_in_1),
                static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
                input_lwe_ciphertext_count);
}

/*
 * Perform the addition of two u64 input LWE ciphertext vectors.
 * - `stream` is a pointer to the Cuda stream to be used in the kernel launch;
 * it also carries the index of the GPU to be used
 * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
 * - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
 * should have been allocated and initialized before calling this function. It
 * has the same size as the output array.
 * - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
 * should have been allocated and initialized before calling this function. It
 * has the same size as the output array.
 * - `input_lwe_dimension` is the number of mask elements in the two input and
 * in the output ciphertext vectors
 * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
 * input LWE ciphertext vector, as well as in the output.
 *
 * Each element (mask element or body) of the input LWE ciphertext vector 1 is
 * added to the corresponding element in the input LWE ciphertext 2. The result
 * is stored in the output LWE ciphertext vector. The two input LWE ciphertext
 * vectors are left unchanged. This function is a wrapper to a device function
 * that performs the operation on the GPU.
 */
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

  host_addition(stream, static_cast<uint64_t *>(lwe_array_out),
                static_cast<uint64_t *>(lwe_array_in_1),
                static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
                input_lwe_ciphertext_count);
}
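
/*
 * Hedged usage sketch (illustration only, not part of this file): all buffers
 * are device pointers; `d_out`, `d_in_1`, `d_in_2` and their allocation are
 * assumptions left to the caller, e.g. via the backend's allocation helpers.
 *
 *   uint32_t lwe_dimension = 600, count = 10;
 *   uint64_t len = (uint64_t)(lwe_dimension + 1) * count;
 *   // d_out, d_in_1, d_in_2: device buffers of len uint64_t elements each
 *   cuda_add_lwe_ciphertext_vector_64(stream, d_out, d_in_1, d_in_2,
 *                                     lwe_dimension, count);
 */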
/*
 * Perform the addition of a u32 input LWE ciphertext vector with a u32
 * plaintext vector. See the equivalent operation on u64 data for more details.
 */
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_addition_plaintext(stream, static_cast<uint32_t *>(lwe_array_out),
                          static_cast<uint32_t *>(lwe_array_in),
                          static_cast<uint32_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
 * Perform the addition of a u64 input LWE ciphertext vector with a u64 input
 * plaintext vector.
 * - `stream` is a pointer to the Cuda stream to be used in the kernel launch;
 * it also carries the index of the GPU to be used
 * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
 * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
 * been allocated and initialized before calling this function. It has the same
 * size as the output array.
 * - `plaintext_array_in` is the plaintext vector used as input, it should have
 * been allocated and initialized before calling this function. It should be of
 * size `input_lwe_ciphertext_count`.
 * - `input_lwe_dimension` is the number of mask elements in the input and
 * output LWE ciphertext vectors
 * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
 * input LWE ciphertext vector, as well as in the output. It is also the number
 * of plaintexts in the input plaintext vector.
 *
 * Each plaintext of the input plaintext vector is added to the body of the
 * corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
 * operation is stored in the output LWE ciphertext vector. The two input
 * vectors are unchanged. This function is a wrapper to a device function that
 * performs the operation on the GPU.
 */
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_addition_plaintext(stream, static_cast<uint64_t *>(lwe_array_out),
                          static_cast<uint64_t *>(lwe_array_in),
                          static_cast<uint64_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}
@@ -0,0 +1,154 @@
#ifndef CUDA_ADD_CUH
#define CUDA_ADD_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"
#include <stdio.h>

template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
                                   uint32_t input_lwe_dimension,
                                   uint32_t num_entries) {

  int tid = threadIdx.x;
  int plaintext_index = blockIdx.x * blockDim.x + tid;
  if (plaintext_index < num_entries) {
    int index =
        plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = lwe_input[index] + plaintext_input[plaintext_index];
  }
}
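// Index layout illustration (not part of the original file): an LWE
// ciphertext is stored as `input_lwe_dimension` mask elements followed by one
// body, so ciphertext i occupies slots
// [i * (input_lwe_dimension + 1), (i + 1) * (input_lwe_dimension + 1)) and
// its body sits at i * (input_lwe_dimension + 1) + input_lwe_dimension,
// which is exactly the `index` computed by the kernel above.
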
template <typename T>
__host__ void host_addition_plaintext(cuda_stream_t *stream, T *output,
                                      T *lwe_input, T *plaintext_input,
                                      uint32_t lwe_dimension,
                                      uint32_t lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // The copy size is expressed in bytes, hence the sizeof(T) factor
  cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
                               (lwe_dimension + 1) * lwe_ciphertext_count *
                                   sizeof(T),
                               stream);
  plaintext_addition<<<grid, thds, 0, stream->stream>>>(
      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
}

template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
                         uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = input_1[index] + input_2[index];
  }
}

// Coefficient-wise addition
template <typename T>
__host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
                            T *input_2, uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  addition<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
                                              num_entries);
  check_cuda_error(cudaGetLastError());
}

template <typename T>
__global__ void subtraction(T *output, T *input_1, T *input_2,
                            uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = input_1[index] - input_2[index];
  }
}

// Coefficient-wise subtraction
template <typename T>
__host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
                               T *input_2, uint32_t input_lwe_dimension,
                               uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  subtraction<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
                                                 num_entries);
  check_cuda_error(cudaGetLastError());
}

template <typename T>
__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
                                               uint32_t input_lwe_dimension,
                                               uint32_t num_entries) {

  int tid = threadIdx.x;
  int plaintext_index = blockIdx.x * blockDim.x + tid;
  if (plaintext_index < num_entries) {
    int index =
        plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
    // Here we take advantage of the wrapping behaviour of uint
    lwe_ct[index] -= plaintext_input[plaintext_index];
  }
}

template <typename T>
__host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
                                         T *lwe_input, T *plaintext_input,
                                         uint32_t input_lwe_dimension,
                                         uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
                               input_lwe_ciphertext_count *
                                   (input_lwe_dimension + 1) * sizeof(T),
                               stream);

  radix_body_subtraction_inplace<<<grid, thds, 0, stream->stream>>>(
      output, plaintext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
}
#endif // CUDA_ADD_CUH
@@ -0,0 +1,56 @@
#include "linearalgebra/multiplication.cuh"

/*
 * Perform the multiplication of a u32 input LWE ciphertext vector with a u32
 * cleartext vector. See the equivalent operation on u64 data for more details.
 */
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_cleartext_multiplication(stream, static_cast<uint32_t *>(lwe_array_out),
                                static_cast<uint32_t *>(lwe_array_in),
                                static_cast<uint32_t *>(cleartext_array_in),
                                input_lwe_dimension,
                                input_lwe_ciphertext_count);
}
/*
 * Perform the multiplication of a u64 input LWE ciphertext vector with a u64
 * input cleartext vector.
 * - `stream` is a pointer to the Cuda stream to be used in the kernel launch;
 * it also carries the index of the GPU to be used
 * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
 * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
 * been allocated and initialized before calling this function. It has the same
 * size as the output array.
 * - `cleartext_array_in` is the cleartext vector used as input, it should have
 * been allocated and initialized before calling this function. It should be of
 * size `input_lwe_ciphertext_count`.
 * - `input_lwe_dimension` is the number of mask elements in the input and
 * output LWE ciphertext vectors
 * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
 * input LWE ciphertext vector, as well as in the output. It is also the number
 * of cleartexts in the input cleartext vector.
 *
 * Each cleartext of the input cleartext vector is multiplied with the mask and
 * body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
 * result of the operation is stored in the output LWE ciphertext vector. The
 * two input vectors are unchanged. This function is a wrapper to a device
 * function that performs the operation on the GPU.
 */
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_cleartext_multiplication(stream, static_cast<uint64_t *>(lwe_array_out),
                                static_cast<uint64_t *>(lwe_array_in),
                                static_cast<uint64_t *>(cleartext_array_in),
                                input_lwe_dimension,
                                input_lwe_ciphertext_count);
}
@@ -0,0 +1,52 @@
#ifndef CUDA_MULT_CUH
#define CUDA_MULT_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"
#include <fstream>
#include <iostream>
#include <vector>

template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
                         uint32_t input_lwe_dimension, uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    int cleartext_index = index / (input_lwe_dimension + 1);
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = lwe_input[index] * cleartext_input[cleartext_index];
  }
}
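// Index layout illustration (not part of the original file): entries are laid
// out ciphertext by ciphertext, each occupying input_lwe_dimension + 1 slots.
// With input_lwe_dimension = 2, global index 7 belongs to ciphertext
// 7 / 3 = 2, so every element of a given ciphertext picks up the same
// cleartext factor.
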
template <typename T>
__host__ void
host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
                              T *cleartext_input, uint32_t input_lwe_dimension,
                              uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  cleartext_multiplication<<<grid, thds, 0, stream->stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
}

#endif // CUDA_MULT_CUH
@@ -0,0 +1,49 @@
#include "linearalgebra/negation.cuh"

/*
 * Perform the negation of a u32 input LWE ciphertext vector.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

  host_negation(stream, static_cast<uint32_t *>(lwe_array_out),
                static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
                input_lwe_ciphertext_count);
}

/*
 * Perform the negation of a u64 input LWE ciphertext vector.
 * - `stream` is a pointer to the Cuda stream to be used in the kernel launch;
 * it also carries the index of the GPU to be used
 * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
 * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
 * been allocated and initialized before calling this function. It has the same
 * size as the output array.
 * - `input_lwe_dimension` is the number of mask elements in the input and
 * output ciphertext vectors
 * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
 * input LWE ciphertext vector, as well as in the output.
 *
 * Each element (mask element or body) of the input LWE ciphertext vector is
 * negated. The result is stored in the output LWE ciphertext vector. The input
 * LWE ciphertext vector is left unchanged. This function is a wrapper to a
 * device function that performs the operation on the GPU.
 */
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

  host_negation(stream, static_cast<uint64_t *>(lwe_array_out),
                static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
                input_lwe_ciphertext_count);
}
@@ -0,0 +1,44 @@
#ifndef CUDA_NEGATE_CUH
#define CUDA_NEGATE_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"

template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = -input[index];
  }
}

template <typename T>
__host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
                            uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  negation<<<grid, thds, 0, stream->stream>>>(output, input, num_entries);
  check_cuda_error(cudaGetLastError());
}

#endif // CUDA_NEGATE_CUH
@@ -0,0 +1 @@
#include "bootstrapping_key.cuh"
@@ -0,0 +1,377 @@
#include "bootstrap_amortized.cuh"

/*
 * Returns the buffer size for 64-bit executions
 */
uint64_t get_buffer_size_bootstrap_amortized_64(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  return get_buffer_size_bootstrap_amortized<uint64_t>(
      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
      max_shared_memory);
}

/*
 * Runs standard checks to validate the inputs
 */
void checks_fast_bootstrap_amortized(int polynomial_size) {
  assert(
      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192, 16384",
       polynomial_size == 256 || polynomial_size == 512 ||
           polynomial_size == 1024 || polynomial_size == 2048 ||
           polynomial_size == 4096 || polynomial_size == 8192 ||
           polynomial_size == 16384));
}

/*
 * Runs standard checks to validate the inputs
 */
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
  assert(("Error (GPU amortized PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_amortized(polynomial_size);
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 32-bit inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
 * be used.
 */
void scratch_cuda_bootstrap_amortized_32(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  checks_fast_bootstrap_amortized(polynomial_size);

  switch (polynomial_size) {
  case 256:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<256>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<512>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<1024>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<2048>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<4096>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<8192>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<16384>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    break;
  }
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 64-bit inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
 * be used.
 */
void scratch_cuda_bootstrap_amortized_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  checks_fast_bootstrap_amortized(polynomial_size);

  switch (polynomial_size) {
  case 256:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<256>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<512>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<1024>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<2048>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<4096>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<8192>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<16384>>(
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    break;
  }
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
 * ciphertexts. See the corresponding operation on 64 bits for more details.
 */
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {

  checks_bootstrap_amortized(32, base_log, polynomial_size);

  switch (polynomial_size) {
  case 256:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 512:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 16384:
    host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  default:
    break;
  }
}

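/*
 * Hedged usage sketch (illustration only, not part of this file): the
 * amortized PBS follows the same scratch/compute pattern as the other entry
 * points. All device buffers, keys and parameter values below are assumptions
 * left to the caller.
 *
 *   int8_t *pbs_buffer = nullptr;
 *   scratch_cuda_bootstrap_amortized_64(stream, &pbs_buffer, glwe_dimension,
 *                                       polynomial_size, num_samples,
 *                                       max_shared_memory, true);
 *   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
 *       stream, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes, d_lwe_in,
 *       d_in_indexes, d_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
 *       polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
 *       0, max_shared_memory);
 */
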
/* Perform the programmable bootstrapping on a batch of input u64 LWE
 * ciphertexts. This function performs best for large numbers of inputs (> 10).
 * - `stream` is a pointer to the Cuda stream to be used in the kernel launch;
 * it also carries the index of the GPU to be used
 * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 * (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 * as there are input ciphertexts, but actually holds
 * num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 * which test vector of lut_vector to use for each LWE input in
 * lwe_array_in
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 * mask values + 1 body value
 * - bootstrapping_key: GGSW encryption of the LWE secret key sk1
 * under secret key sk2
 * bsk = Z + sk1 H
 * where H is the gadget matrix and Z is a matrix (k+1).l
 * containing GLWE encryptions of 0 under sk2.
 * bsk is thus a tensor of size (k+1)^2.l.N.n
 * where l is the number of decomposition levels and
 * k is the GLWE dimension, N is the polynomial size for
 * GLWE. The polynomial size for GLWE and the test vector
 * are the same because they have to be in the same ring
 * to be multiplied.
 * - lwe_dimension: size of the Torus vector used to encrypt the input
 * LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 * GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
 * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_lut_vectors: parameter to set the actual number of test vectors to be
 * used
 * - lwe_idx: the index of the LWE input to consider for the GPU running this
 * stream. In case of multi-GPU computing, it is assumed that only a part of
 * the input LWE array is copied to each GPU, but the whole LUT array is copied
 * (because the case when the number of LUTs is smaller than the number of input
 * LWEs is not trivial to take into account in the data repartition on the
 * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
 * input in the LUT array `lut_vector`.
 * - max_shared_memory: maximum amount of shared memory to be used inside
 * device functions
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 * polynomial degree
 * - num_samples blocks of threads are launched, where each thread is going
 * to handle one or more polynomial coefficients at each stage:
 * - perform the blind rotation
 * - round the result
 * - decompose into level_count levels, then for each level:
 * - switch to the FFT domain
 * - multiply with the bootstrapping key
 * - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary
 * - in case the device has enough shared memory, temporary arrays used for
 * the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 * levels
 * - the constant memory (64K) is used for storing the roots of identity
 * values for the FFT
 */
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {

  checks_bootstrap_amortized(64, base_log, polynomial_size);

  switch (polynomial_size) {
  case 256:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 512:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 16384:
    host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  default:
    break;
  }
}

/*
 * This cleanup function frees the data for the amortized PBS on GPU in
 * pbs_buffer for 32 or 64 bits inputs.
 */
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
                                      int8_t **pbs_buffer) {

  // Free memory
  cuda_drop_async(*pbs_buffer, stream);
}
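
/*
 * Illustrative call sequence (a sketch, not part of the API surface): the
 * amortized PBS entry points are meant to be used as scratch -> bootstrap ->
 * cleanup. The scratch entry point name and all parameter values below are
 * assumptions for illustration only.
 *
 *   int8_t *pbs_buffer = nullptr;
 *   scratch_cuda_bootstrap_amortized_64(stream, &pbs_buffer, glwe_dimension,
 *                                       polynomial_size, num_samples,
 *                                       max_shared_memory, true);
 *   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
 *       stream, lwe_array_out, lwe_output_indexes, lut_vector,
 *       lut_vector_indexes, lwe_array_in, lwe_input_indexes,
 *       bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
 *       polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
 *       0, max_shared_memory);
 *   cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
 */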
@@ -0,0 +1,363 @@
#ifndef CUDA_AMORTIZED_PBS_CUH
#define CUDA_AMORTIZED_PBS_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"

/*
 * Kernel launched by host_bootstrap_amortized
 *
 * Uses shared memory to increase performance
 * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 * (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 * as there are input ciphertexts, but actually holds
 * num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to which test vector
 * to use for each sample in lut_vector
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 * mask values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
 * key sk2
 * - device_mem: pointer to the device's global memory in case we use it (SMD
 * == NOSM or PARTIALSM)
 * - lwe_dimension: size of the Torus vector used to encrypt the input
 * LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 * GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - gpu_num: index of the current GPU (useful for multi-GPU computations)
 * - lwe_idx: equal to the number of samples per gpu x gpu_num
 * - device_memory_size_per_sample: amount of global memory to allocate if SMD
 * is not FULLSM
 */
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_amortized(
    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_idx,
    size_t device_memory_size_per_sample) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;

  if constexpr (SMD == FULLSM)
    selected_memory = sharedmem;
  else
    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];

  // For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only
  // one mask polynomial and 1 body to handle.
  Torus *accumulator = (Torus *)selected_memory;
  Torus *accumulator_rotated =
      (Torus *)accumulator +
      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
  double2 *res_fft =
      (double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
                                           (sizeof(double2) / sizeof(Torus));
  double2 *accumulator_fft = (double2 *)sharedmem;
  if constexpr (SMD != PARTIALSM)
    accumulator_fft = (double2 *)res_fft +
                      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);

  auto block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
  Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
                  (glwe_dimension + 1)];

  // Put "b", the body, in [0, 2N[
  Torus b_hat = 0;
  rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                        2 * params::degree);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);

  // Loop over all the mask elements of the sample to accumulate
  // (X^a_i - 1) multiplication, decomposition of the resulting polynomial
  // into level_count polynomials, and performing polynomial multiplication
  // via an FFT with the RGSW encrypted secret key
  for (int iteration = 0; iteration < lwe_dimension; iteration++) {
    synchronize_threads_in_block();

    // Put "a" in [0, 2N[ instead of Zq
    Torus a_hat = 0;
    rescale_torus_element(block_lwe_array_in[iteration], a_hat,
                          2 * params::degree);

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);

    synchronize_threads_in_block();

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_rotated, base_log, level_count, glwe_dimension + 1);

    // Initialize the polynomial multiplication via FFT arrays
    // The polynomial multiplications happen at the block level
    // and each thread handles two or more coefficients
    int pos = threadIdx.x;
    for (int i = 0; i < (glwe_dimension + 1); i++)
      for (int j = 0; j < params::opt / 2; j++) {
        res_fft[pos].x = 0;
        res_fft[pos].y = 0;
        pos += params::degree / params::opt;
      }

    GadgetMatrix<Torus, params> gadget(base_log, level_count,
                                       accumulator_rotated, glwe_dimension + 1);
    // Now that the rotation is done, decompose the resulting polynomial
    // coefficients so as to multiply each decomposed level with the
    // corresponding part of the bootstrapping key
    for (int level = level_count - 1; level >= 0; level--) {
      for (int i = 0; i < (glwe_dimension + 1); i++) {
        gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);

        // Switch to the FFT space
        NSMFFT_direct<HalfDegree<params>>(accumulator_fft);

        // Get the bootstrapping key piece necessary for the multiplication
        // It is already in the Fourier domain
        auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
                                                level, polynomial_size,
                                                glwe_dimension, level_count);

        // Perform the coefficient-wise product with the two pieces of
        // bootstrapping key
        for (int j = 0; j < (glwe_dimension + 1); j++) {
          auto bsk_poly = bsk_slice + j * params::degree / 2;
          auto res_fft_poly = res_fft + j * params::degree / 2;
          polynomial_product_accumulate_in_fourier_domain<params, double2>(
              res_fft_poly, accumulator_fft, bsk_poly);
        }
      }
      synchronize_threads_in_block();
    }

    // Come back to the coefficient representation
    if constexpr (SMD == FULLSM || SMD == NOSM) {
      synchronize_threads_in_block();

      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto res_fft_slice = res_fft + i * params::degree / 2;
        NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
      }
      synchronize_threads_in_block();

      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto accumulator_slice = accumulator + i * params::degree;
        auto res_fft_slice = res_fft + i * params::degree / 2;
        add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
      }
      synchronize_threads_in_block();
    } else {
#pragma unroll
      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto accumulator_slice = accumulator + i * params::degree;
        auto res_fft_slice = res_fft + i * params::degree / 2;
        int tid = threadIdx.x;
        for (int j = 0; j < params::opt / 2; j++) {
          accumulator_fft[tid] = res_fft_slice[tid];
          tid = tid + params::degree / params::opt;
        }
        synchronize_threads_in_block();

        NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
        synchronize_threads_in_block();

        add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
      }
      synchronize_threads_in_block();
    }
  }

  auto block_lwe_array_out =
      &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                     (glwe_dimension * polynomial_size + 1)];

  // The blind rotation for this block is over
  // Now we can perform the sample extraction: for the body it's just
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
    uint32_t polynomial_size, uint32_t glwe_dimension) {
  return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
         sizeof(Torus) * polynomial_size *
             (glwe_dimension + 1) +              // accumulator rotated
         sizeof(double2) * polynomial_size / 2 + // accumulator fft
         sizeof(double2) * polynomial_size / 2 *
             (glwe_dimension + 1); // res fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
  uint64_t partial_dm = full_sm - partial_sm;
  uint64_t full_dm = full_sm;
  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count;
  } else if (max_shared_memory < full_sm) {
    device_mem = partial_dm * input_lwe_ciphertext_count;
  }
  return device_mem + device_mem % sizeof(double2);
}

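/*
 * Worked example of the three memory regimes (a sketch; the numbers assume
 * Torus = uint64_t, polynomial_size = 1024, glwe_dimension = 1):
 *   accumulator:         8 B * 1024 * 2 = 16384 bytes
 *   accumulator rotated: 16384 bytes
 *   accumulator fft:     16 B * 512    =  8192 bytes
 *   res fft:             16 B * 512 * 2 = 16384 bytes
 *   full_sm = 57344 bytes (~56 KB), partial_sm = 8192 bytes
 * A device exposing 48 KB of dynamic shared memory falls in
 * [partial_sm, full_sm), so only the FFT accumulator lives in shared memory
 * and the remaining full_sm - partial_sm bytes per sample are allocated in
 * global memory; below 8 KB everything spills to global memory (NOSM).
 */
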
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_amortized(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         partial_sm);
    cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                           cudaFuncCachePreferShared);
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncCachePreferShared));
  }
  if (allocate_gpu_memory) {
    uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
    check_cuda_error(cudaGetLastError());
  }
}

template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
    uint32_t lwe_idx, uint32_t max_shared_memory) {

  cudaSetDevice(stream->gpu_index);
  uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);

  uint64_t SM_PART =
      get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);

  uint64_t DM_PART = SM_FULL - SM_PART;

  uint64_t DM_FULL = SM_FULL;

  // Create a 1-dimensional grid of threads
  // where each block handles 1 sample and each thread
  // handles opt polynomial coefficients
  // (actually opt/2 coefficients since we compress the real polynomial into a
  // complex)
  dim3 grid(input_lwe_ciphertext_count, 1, 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  // Launch the kernel using polynomial_size/opt threads
  // where each thread computes opt polynomial coefficients
  // Depending on the required amount of shared memory, choose
  // from one of three templates (no use, partial use or full use
  // of shared memory)
  if (max_shared_memory < SM_PART) {
    device_bootstrap_amortized<Torus, params, NOSM>
        <<<grid, thds, 0, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, lwe_idx, DM_FULL);
  } else if (max_shared_memory < SM_FULL) {
    device_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, lwe_idx, DM_PART);
  } else {
    // For devices with compute capability 7.x a single thread block can
    // address the full capacity of shared memory. Shared memory on the
    // device then has to be allocated dynamically.
    // For lower compute capabilities, this call
    // just does nothing and the amount of shared memory used is 48 KB
    device_bootstrap_amortized<Torus, params, FULLSM>
        <<<grid, thds, SM_FULL, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, lwe_idx, 0);
  }
  check_cuda_error(cudaGetLastError());
}

template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {

  int blocks_per_sm = 0;
  int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  cudaGetDeviceProperties(&device_properties, 0);
  // The kernel template must be fully instantiated to query its occupancy;
  // the FULLSM variant is assumed here as the representative configuration
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);

  return device_properties.multiProcessorCount * blocks_per_sm;
}

#endif // CUDA_AMORTIZED_PBS_CUH
@@ -0,0 +1,453 @@
#ifndef CUDA_FAST_LOWLAT_PBS_CUH
#define CUDA_FAST_LOWLAT_PBS_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "cooperative_groups.h"

#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"

// Cooperative groups are used in the low latency PBS
using namespace cooperative_groups;
namespace cg = cooperative_groups;

template <typename Torus, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
                              double2 *join_buffer, double2 *bootstrapping_key,
                              int polynomial_size, uint32_t glwe_dimension,
                              int level_count, int iteration,
                              grid_group &grid) {

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  // Get the pieces of the bootstrapping key that will be needed for the
  // external product; blockIdx.x is the ID of the block that's executing
  // this function, so we end up getting the lines of the bootstrapping key
  // needed to perform the external product in this block (corresponding to
  // the same decomposition level)
  auto bsk_slice = get_ith_mask_kth_block(
      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
      glwe_dimension, level_count);

  // Selects all GLWEs in a particular decomposition level
  auto level_join_buffer =
      join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;

  // Perform the matrix multiplication between the GGSW and the GLWE,
  // each block operating on a single level for mask and body

  // The first product is used to initialize level_join_buffer
  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
  auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;

  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
    tid += params::degree / params::opt;
  }

  grid.sync();

  // Continues multiplying fft by every polynomial in that particular bsk level
  // Each y-block accumulates in a different polynomial at each iteration
  for (int j = 1; j < (glwe_dimension + 1); j++) {
    int idx = (j + blockIdx.y) % (glwe_dimension + 1);

    auto bsk_poly = bsk_slice + idx * params::degree / 2;
    auto buffer_slice = level_join_buffer + idx * params::degree / 2;

    int tid = threadIdx.x;
    for (int i = 0; i < params::opt / 2; i++) {
      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
      tid += params::degree / params::opt;
    }
    grid.sync();
  }

  // -----------------------------------------------------------------
  // All blocks are synchronized here; after this sync, level_join_buffer has
  // the values needed from every other block

  auto src_acc = join_buffer + blockIdx.y * params::degree / 2;

  // copy first product into fft buffer
  tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = src_acc[tid];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // accumulate rest of the products into fft buffer
  for (int l = 1; l < gridDim.x; l++) {
    auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
    tid = threadIdx.x;
    for (int i = 0; i < params::opt / 2; i++) {
      fft[tid] += cur_src_acc[tid];
      tid += params::degree / params::opt;
    }
  }

  synchronize_threads_in_block();

  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
  // accumulator
  NSMFFT_inverse<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  add_to_torus<Torus, params>(fft, accumulator);

  __syncthreads();
}

/*
 * Kernel launched by the low latency version of the
 * bootstrapping, that uses cooperative groups
 *
 * - lwe_array_out: vector of output LWE ciphertexts, with length
 * (glwe_dimension * polynomial_size + 1) * num_samples
 * - lut_vector: vector of look up tables with
 * length (glwe_dimension + 1) * polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of lwe inputs with length (lwe_dimension + 1) *
 * num_samples
 *
 * Each y-block computes one element of the lwe_array_out.
 */
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_fast_low_latency(
    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    int8_t *device_mem, uint64_t device_memory_size_per_block) {

  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;
  uint32_t glwe_dimension = gridDim.y - 1;

  if constexpr (SMD == FULLSM) {
    selected_memory = sharedmem;
  } else {
    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                      blockIdx.z * gridDim.x * gridDim.y;
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  // We always compute the pointer with the most restrictive alignment to
  // avoid alignment issues
  double2 *accumulator_fft = (double2 *)selected_memory;
  Torus *accumulator =
      (Torus *)accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
  Torus *accumulator_rotated =
      (Torus *)accumulator + (ptrdiff_t)polynomial_size;

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double2 *)sharedmem;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
                                        params::degree * (glwe_dimension + 1)];

  double2 *block_join_buffer =
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                   params::degree / 2];
  // Since the space in L1 cache is small, we use the same memory location for
  // the rotated accumulator and the fft accumulator, since we know that the
  // rotated array is not in use anymore by the time we perform the fft

  // Put "b" in [0, 2N[
  Torus b_hat = 0;
  rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                        2 * params::degree);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
      false);

  for (int i = 0; i < lwe_dimension; i++) {
    synchronize_threads_in_block();

    // Put "a" in [0, 2N[
    Torus a_hat = 0;
    rescale_torus_element(block_lwe_array_in[i], a_hat, 2 * params::degree);

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator, accumulator_rotated, a_hat);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_rotated, base_log, level_count);

    synchronize_threads_in_block();

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
    // accumulator_rotated, so we need to synchronize here to make sure they
    // don't modify the same memory space at the same time
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
    mul_ggsw_glwe<Torus, params>(
        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
        polynomial_size, glwe_dimension, level_count, i, grid);

    synchronize_threads_in_block();
  }

  auto block_lwe_array_out =
      &lwe_array_out[lwe_output_indexes[blockIdx.z] *
                         (glwe_dimension * polynomial_size + 1) +
                     blockIdx.y * polynomial_size];

  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
  }
}

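/*
 * Grid shape illustration (a sketch with placeholder parameters): the host
 * wrapper further down launches a 3D grid of
 *   dim3 grid(level_count, glwe_dimension + 1, num_samples);
 * e.g. level_count = 2, glwe_dimension = 1, num_samples = 4 gives
 * 2 * 2 * 4 = 16 blocks. Because the kernel is launched cooperatively and
 * calls grid.sync(), all 16 blocks must be resident on the device at once,
 * which is exactly what verify_cuda_bootstrap_fast_low_latency_grid_size
 * checks at the end of this file.
 */
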
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(Torus) * polynomial_size +      // accumulator
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
    uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
      polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
          polynomial_size);
  uint64_t partial_dm = full_sm - partial_sm;
  uint64_t full_dm = full_sm;
  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm) {
    device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  }
  uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
                                          input_lwe_ciphertext_count *
                                          polynomial_size / 2 * sizeof(double2);
  return buffer_size + buffer_size % sizeof(double2);
}

template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_fast_low_latency(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
      polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
          polynomial_size);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
        device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    cudaFuncSetCacheConfig(
        device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }
  if (allocate_gpu_memory) {
    uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
    check_cuda_error(cudaGetLastError());
  }
}

/*
 * Host wrapper to the low latency version
 * of bootstrapping
 */
template <typename Torus, class params>
__host__ void host_bootstrap_fast_low_latency(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
    uint32_t max_shared_memory) {
  cudaSetDevice(stream->gpu_index);

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
      polynomial_size);

  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
          polynomial_size);

  uint64_t full_dm = full_sm;

  uint64_t partial_dm = full_dm - partial_sm;

  int8_t *d_mem = pbs_buffer;
  double2 *buffer_fft =
      (double2 *)d_mem +
      (ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
                      glwe_dimension, polynomial_size, level_count,
                      input_lwe_ciphertext_count, max_shared_memory) /
                      sizeof(double2) -
                  (glwe_dimension + 1) * level_count *
                      input_lwe_ciphertext_count * polynomial_size / 2);

  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

  void *kernel_args[14];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
  kernel_args[3] = &lut_vector_indexes;
  kernel_args[4] = &lwe_array_in;
  kernel_args[5] = &lwe_input_indexes;
  kernel_args[6] = &bootstrapping_key;
  kernel_args[7] = &buffer_fft;
  kernel_args[8] = &lwe_dimension;
  kernel_args[9] = &polynomial_size;
  kernel_args[10] = &base_log;
  kernel_args[11] = &level_count;
  kernel_args[12] = &d_mem;

  if (max_shared_memory < partial_sm) {
    kernel_args[13] = &full_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, grid,
        thds, (void **)kernel_args, 0, stream->stream));
  } else if (max_shared_memory < full_sm) {
    kernel_args[13] = &partial_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
        grid, thds, (void **)kernel_args, partial_sm, stream->stream));
  } else {
    // Must match the kernel's uint64_t device_memory_size_per_block parameter
    uint64_t no_dm = 0;
    kernel_args[13] = &no_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, grid,
        thds, (void **)kernel_args, full_sm, stream->stream));
  }

  check_cuda_error(cudaGetLastError());
}

// Verify if the grid size for the low latency kernel satisfies the cooperative
// group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
    int glwe_dimension, int level_count, int num_samples,
    uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
    return false;

  // Calculate the dimension of the kernel
  uint64_t full_sm =
      get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(params::degree);

  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
          params::degree);

  int thds = params::degree / params::opt;

  // Get the maximum number of active blocks per streaming multiprocessor
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  int max_active_blocks_per_sm;

  if (max_shared_memory < partial_sm) {
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, thds,
        0);
  } else if (max_shared_memory < full_sm) {
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
        thds, 0);
  } else {
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
        0);
  }

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

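/*
 * Usage sketch (placeholder parameters; the fallback path is assumed to be
 * provided by bootstrap_low_latency.cuh): callers probe the grid size before
 * picking this implementation, mirroring the 64-bit dispatcher later in this
 * commit:
 *
 *   if (verify_cuda_bootstrap_fast_low_latency_grid_size<
 *           uint64_t, AmortizedDegree<1024>>(glwe_dimension, level_count,
 *                                            num_samples, max_shared_memory))
 *     // cooperative fast path: host_bootstrap_fast_low_latency<...>(...)
 *   else
 *     // regular low-latency path without cooperative groups
 */
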
#endif // CUDA_FAST_LOWLAT_PBS_CUH
@@ -0,0 +1,321 @@
#ifndef CUDA_FAST_MULTIBIT_PBS_CUH
#define CUDA_FAST_MULTIBIT_PBS_CUH

#include "bootstrap.h"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
#include <vector>

template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_fast_accumulate(
    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
    uint32_t lwe_offset, uint32_t lwe_chunk_size,
    uint32_t keybundle_size_per_input) {

  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;

  selected_memory = sharedmem;

  // We always compute the pointer with the most restrictive alignment to
  // avoid alignment issues
  double2 *accumulator_fft = (double2 *)selected_memory;
  Torus *accumulator =
      (Torus *)accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
                                        params::degree * (glwe_dimension + 1)];

  double2 *block_join_buffer =
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                   params::degree / 2];

  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

  double2 *keybundle = keybundle_array +
                       // select the input
                       blockIdx.z * keybundle_size_per_input;

  if (lwe_offset == 0) {
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                          2 * params::degree);

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
        global_slice, accumulator);
  }

  for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size;
       i++) {
    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator, base_log, level_count);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
    // accumulator_rotated, so we need to synchronize here to make sure they
    // don't modify the same memory space at the same time
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
    mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
                                 block_join_buffer, keybundle, polynomial_size,
                                 glwe_dimension, level_count, i, grid);

    synchronize_threads_in_block();
  }

  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
    auto block_lwe_array_out =
        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

    if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra
      // blocks, in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
  } else {
    // Store the accumulator so it can be reloaded on the next chunk iteration
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
        accumulator, global_slice);
  }
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_fast_multibit_bootstrap(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size * 2; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    uint32_t grouping_factor, uint32_t lwe_chunk_size,
    uint32_t max_shared_memory) {

  uint64_t buffer_size = 0;
  buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
                 (glwe_dimension + 1) * (glwe_dimension + 1) *
                 (polynomial_size / 2) * sizeof(double2); // keybundle fft
  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
                 level_count * (polynomial_size / 2) *
                 sizeof(double2); // join buffer
  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
                 polynomial_size * sizeof(Torus); // global_accumulator

  return buffer_size + buffer_size % sizeof(double2);
}

template <typename Torus, typename STorus, typename params>
__host__ void scratch_fast_multi_bit_pbs(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
    uint32_t max_shared_memory, bool allocate_gpu_memory,
    uint32_t lwe_chunk_size = 0) {

  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate =
      get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);

  check_cuda_error(cudaFuncSetAttribute(
      device_multi_bit_bootstrap_keybundle<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
  cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
                         cudaFuncCachePreferShared);
  check_cuda_error(cudaGetLastError());

  check_cuda_error(cudaFuncSetAttribute(
      device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate));
  cudaFuncSetCacheConfig(
      device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
      cudaFuncCachePreferShared);
  check_cuda_error(cudaGetLastError());

  if (allocate_gpu_memory) {
    if (!lwe_chunk_size)
      lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
                                                  glwe_dimension);

    uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
        max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
    check_cuda_error(cudaGetLastError());
  }
}

template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
  cudaSetDevice(stream->gpu_index);

  if (!lwe_chunk_size)
    lwe_chunk_size =
        get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension);

  // Buffer layout: keybundle fft | join buffer | global accumulator
  double2 *keybundle_fft = (double2 *)pbs_buffer;
  double2 *buffer_fft = (double2 *)keybundle_fft +
                        num_samples * lwe_chunk_size * level_count *
                            (glwe_dimension + 1) * (glwe_dimension + 1) *
                            (polynomial_size / 2);
  Torus *global_accumulator =
      (Torus *)buffer_fft +
      (ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
                  level_count * (polynomial_size / 2) / sizeof(Torus));

  // Shared memory requirements of the two kernels
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate =
      get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
      (glwe_dimension + 1) * (polynomial_size / 2);

  // Arguments for the cooperative launch of the accumulate kernel; slots 15
  // and 16 (lwe_offset, chunk_size) are filled inside the loop below
  void *kernel_args[18];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
  kernel_args[3] = &lut_vector_indexes;
  kernel_args[4] = &lwe_array_in;
  kernel_args[5] = &lwe_input_indexes;
  kernel_args[6] = &keybundle_fft;
  kernel_args[7] = &buffer_fft;
  kernel_args[8] = &global_accumulator;
  kernel_args[9] = &lwe_dimension;
  kernel_args[10] = &glwe_dimension;
  kernel_args[11] = &polynomial_size;
  kernel_args[12] = &base_log;
  kernel_args[13] = &level_count;
  kernel_args[14] = &grouping_factor;
  kernel_args[17] = &keybundle_size_per_input;

  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {

    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);

    // Compute a keybundle
    dim3 grid_keybundle(num_samples * chunk_size,
                        (glwe_dimension + 1) * (glwe_dimension + 1),
                        level_count);
    device_multi_bit_bootstrap_keybundle<Torus, params>
        <<<grid_keybundle, thds, full_sm_keybundle, stream->stream>>>(
            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
            base_log, level_count, lwe_offset, chunk_size,
            keybundle_size_per_input);
    check_cuda_error(cudaGetLastError());

    kernel_args[15] = &lwe_offset;
    kernel_args[16] = &chunk_size;

    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
        grid_accumulate, thds, (void **)kernel_args, full_sm_accumulate,
        stream->stream));
  }
}

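/*
 * Chunking illustration (a sketch with placeholder parameters): with
 * lwe_dimension = 742 and grouping_factor = 2 there are 742 / 2 = 371
 * multi-bit groups to process. With lwe_chunk_size = 64, the loop above runs
 * ceil(371 / 64) = 6 times; the last iteration clamps chunk_size to
 * 371 - 5 * 64 = 51. Each iteration recomputes the keybundle FFT for its
 * chunk and then advances the blind rotation by chunk_size groups.
 */
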
// Verify if the grid size for the fast multi-bit kernel satisfies the
// cooperative group constraints
template <typename Torus, class params>
__host__ bool
verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
                                               int level_count, int num_samples,
                                               uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
    return false;

  // Calculate the dimension of the kernel
  uint64_t full_sm =
      get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(params::degree);

  int thds = params::degree / params::opt;

  // Get the maximum number of active blocks per streaming multiprocessor
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  int max_active_blocks_per_sm;

  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks_per_sm,
      (void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>, thds,
      full_sm);

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
#endif // CUDA_FAST_MULTIBIT_PBS_CUH
@@ -0,0 +1,845 @@
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_low_latency.cuh"
/*
 * Returns the buffer size for 64-bit executions
 */
uint64_t get_buffer_size_bootstrap_low_latency_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<256>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<512>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<2048>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<4096>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<8192>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<
            uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                              input_lwe_ciphertext_count,
                                              max_shared_memory))
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    else
      return get_buffer_size_bootstrap_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  default:
    return 0;
    break;
  }
}

/*
 * Runs standard checks to validate the inputs
 */
void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
                                       int polynomial_size, int num_samples) {

  assert((
      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
      "1024, 2048, 4096, 8192, 16384",
      polynomial_size == 256 || polynomial_size == 512 ||
          polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192 ||
          polynomial_size == 16384));
}
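
// In the assert above, the string literal on the left of the comma operator
// is discarded at run time; it only serves to make the message readable when
// the assertion fires and the full expression is printed.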

/*
 * Runs standard checks to validate the inputs
 */
void checks_bootstrap_low_latency(int nbits, int glwe_dimension,
                                  int level_count, int base_log,
                                  int polynomial_size, int num_samples) {
  assert(("Error (GPU low latency PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
                                    polynomial_size, num_samples);
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the low latency PBS on 32-bit inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going
 * to be used.
 */
void scratch_cuda_bootstrap_low_latency_32(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
  checks_fast_bootstrap_low_latency(
      glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<256>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<256>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<256>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<512>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<512>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<512>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<1024>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<1024>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<2048>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<2048>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<2048>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<4096>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<4096>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<4096>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<8192>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<8192>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<8192>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<
            uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                              input_lwe_ciphertext_count,
                                              max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<16384>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    break;
  }
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the low latency PBS on 64-bit inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going
 * to be used.
 */
void scratch_cuda_bootstrap_low_latency_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

  checks_fast_bootstrap_low_latency(
      glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<256>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<256>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<256>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<512>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<512>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<512>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<1024>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<1024>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<2048>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<2048>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<2048>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<4096>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<4096>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<4096>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<8192>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<8192>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<8192>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<
            uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                              input_lwe_ciphertext_count,
                                              max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
                                         AmortizedDegree<16384>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    break;
  }
}

/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
 * This function performs best for small numbers of inputs. Beyond a certain
 * number of inputs (the exact number depends on the cryptographic parameters),
 * the kernel cannot be launched and it is necessary to split the kernel call
 * into several calls on smaller batches of inputs. For more details on this
 * operation, refer to the equivalent u64 operation.
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {

  checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
                               polynomial_size, num_samples);

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<256>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<256>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<256>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<512>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<512>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<512>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<1024>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<1024>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<2048>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<2048>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<2048>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<4096>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<4096>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<4096>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<8192>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<8192>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<8192>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<
            uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                              num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<16384>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint32_t, Degree<16384>>(
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
          static_cast<uint32_t *>(lwe_array_in),
          static_cast<uint32_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  default:
    break;
  }
}

/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
 * This function performs best for small numbers of inputs. Beyond a certain
 * number of inputs (the exact number depends on the cryptographic parameters),
 * the kernel cannot be launched and it is necessary to split the kernel call
 * into several calls on smaller batches of inputs.
 *
 * - `stream` is the Cuda stream, which carries the index of the GPU to be
 * used in the kernel launch
 * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 * (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 * as there are input ciphertexts, but actually holds
 * num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 * which test vector to use for each sample in
 * lut_vector
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 * mask values + 1 body value
 * - bootstrapping_key: GGSW encryption of the LWE secret key sk1
 * under secret key sk2
 * bsk = Z + sk1 H
 * where H is the gadget matrix and Z is a matrix (k+1).l
 * containing GLWE encryptions of 0 under sk2.
 * bsk is thus a tensor of size (k+1)^2.l.N.n
 * where l is the number of decomposition levels and
 * k is the GLWE dimension, N is the polynomial size for
 * GLWE. The polynomial size for GLWE and the test vector
 * are the same because they have to be in the same ring
 * to be multiplied.
 * - lwe_dimension: size of the Torus vector used to encrypt the input
 * LWE ciphertexts - referred to as n above (~ 600)
 * - glwe_dimension: size of the polynomial vector used to encrypt the LUT
 * GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
 * this parameter.
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 * GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_lut_vectors: parameter to set the actual number of test vectors to be
 * used
 * - lwe_idx: the index of the LWE input to consider for the GPU associated
 * with the stream. In case of multi-GPU computing, it is assumed that only a
 * part of the input LWE array is copied to each GPU, but the whole LUT array
 * is copied (because the case when the number of LUTs is smaller than the
 * number of input LWEs is not trivial to take into account in the data
 * repartition on the GPUs). `lwe_idx` is used to determine which LUT to
 * consider for a given LWE input in the LUT array `lut_vector`.
 * - `max_shared_memory`: maximum amount of shared memory to be used inside
 * device functions
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 * polynomial degree
 * - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
 * launched, where each thread is going to handle one or more polynomial
 * coefficients at each stage, for a given level of decomposition, either for
 * the LUT mask or its body:
 * - perform the blind rotation
 * - round the result
 * - get the decomposition for the current level
 * - switch to the FFT domain
 * - multiply with the bootstrapping key
 * - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary (some
 * synchronizations happen at the block level, some happen between blocks,
 * using cooperative groups).
 * - in case the device has enough shared memory, temporary arrays used for
 * the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 * levels
 * - the constant memory (64K) is used for storing the roots of identity
 * values for the FFT
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
  checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
                               polynomial_size, num_samples);

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<256>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<256>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<256>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<512>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<512>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<512>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<1024>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<1024>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<2048>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<2048>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<2048>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<4096>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<4096>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<4096>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                         AmortizedDegree<8192>>(
            glwe_dimension, level_count, num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<8192>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<8192>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<
            uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                              num_samples, max_shared_memory))
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<16384>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    else
      host_bootstrap_low_latency<uint64_t, Degree<16384>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
          lwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_lut_vectors, max_shared_memory);
    break;
  default:
    break;
  }
}

/*
 * This cleanup function frees the data for the low latency PBS on GPU in
 * pbs_buffer for 32 or 64 bits inputs.
 */
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
                                        int8_t **pbs_buffer) {
  // Free memory
  cuda_drop_async(*pbs_buffer, stream);
}
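
/* Illustrative call sequence (a sketch only; the parameter values are
 * hypothetical, and `cuda_create_stream` / `cuda_get_max_shared_memory` are
 * assumed to be the device helpers exposed by device.h). It shows the
 * intended pairing of the scratch, bootstrap and cleanup entry points for the
 * 64-bit low latency PBS:
 *
 *   cuda_stream_t *stream = cuda_create_stream(0);
 *   int8_t *pbs_buffer = nullptr;
 *   uint32_t glwe_dimension = 1, polynomial_size = 1024, level_count = 2;
 *   uint32_t lwe_dimension = 600, base_log = 8, num_samples = 16;
 *   uint32_t num_lut_vectors = 1, lwe_idx = 0;
 *   uint32_t max_shared_memory = cuda_get_max_shared_memory(0);
 *
 *   scratch_cuda_bootstrap_low_latency_64(
 *       stream, &pbs_buffer, glwe_dimension, polynomial_size, level_count,
 *       num_samples, max_shared_memory, true);
 *   cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
 *       stream, lwe_array_out, lwe_output_indexes, lut_vector,
 *       lut_vector_indexes, lwe_array_in, lwe_input_indexes,
 *       bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
 *       polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
 *       lwe_idx, max_shared_memory);
 *   cleanup_cuda_bootstrap_low_latency(stream, &pbs_buffer);
 */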
@@ -0,0 +1,487 @@
#ifndef CUDA_LOWLAT_PBS_CUH
#define CUDA_LOWLAT_PBS_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_one(
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
    uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;
  uint32_t glwe_dimension = gridDim.y - 1;

  if constexpr (SMD == FULLSM) {
    selected_memory = sharedmem;
  } else {
    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                      blockIdx.z * gridDim.x * gridDim.y;
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  Torus *accumulator = (Torus *)selected_memory;
  double2 *accumulator_fft =
      (double2 *)accumulator +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double2 *)sharedmem;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
                                        params::degree * (glwe_dimension + 1)];

  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

  double2 *global_fft_slice =
      global_accumulator_fft +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
       blockIdx.z * level_count * (glwe_dimension + 1)) *
          (polynomial_size / 2);

  if (lwe_iteration == 0) {
    // First iteration
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                          2 * params::degree);
    // The y-dimension is used to select the element of the GLWE this block
    // will compute
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);

    // Persist
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
      tid += params::degree / params::opt;
    }
  }

  // Put "a" in [0, 2N[
  Torus a_hat = 0;
  rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
                        2 * params::degree); // 2 * params::log2_degree + 1);

  synchronize_threads_in_block();

  // Perform ACC * (X^â - 1)
  multiply_by_monomial_negacyclic_and_sub_polynomial<
      Torus, params::opt, params::degree / params::opt>(global_slice,
                                                        accumulator, a_hat);

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
      accumulator, base_log, level_count);

  synchronize_threads_in_block();

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
  gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

  // We are using the same memory space for accumulator_fft and
  // accumulator_rotated, so we need to synchronize here to make sure they
  // don't modify the same memory space at the same time
  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(accumulator_fft);

  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    global_fft_slice[tid] = accumulator_fft[tid];
    tid += params::degree / params::opt;
  }
}
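
// Illustrative mapping (hypothetical parameters): step one is launched on a
// grid of dimensions (level_count, glwe_dimension + 1, num_samples). With
// level_count = 2, glwe_dimension = 1 and num_samples = 10 that is
// 2 * 2 * 10 = 40 blocks; block (x, y, z) decomposes level x of GLWE element
// y (mask if y < glwe_dimension, body otherwise) of sample z, using
// params::degree / params::opt threads per block.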

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_two(
    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
    Torus *lut_vector_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
    uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;
  uint32_t glwe_dimension = gridDim.y - 1;

  if constexpr (SMD == FULLSM) {
    selected_memory = sharedmem;
  } else {
    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                      blockIdx.z * gridDim.x * gridDim.y;
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  // We always compute the pointer with most restrictive alignment to avoid
  // alignment issues
  double2 *accumulator_fft = (double2 *)selected_memory;
  Torus *accumulator =
      (Torus *)accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * params::degree / 2 / sizeof(Torus));

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double2 *)sharedmem;

  for (int level = 0; level < level_count; level++) {
    double2 *global_fft_slice = global_accumulator_fft +
                                (level + blockIdx.x * level_count) *
                                    (glwe_dimension + 1) * (params::degree / 2);

    for (int j = 0; j < (glwe_dimension + 1); j++) {
      double2 *fft = global_fft_slice + j * params::degree / 2;

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      auto bsk_slice =
          get_ith_mask_kth_block(bootstrapping_key, lwe_iteration, j, level,
                                 polynomial_size, glwe_dimension, level_count);
      auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;

      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          accumulator_fft, fft, bsk_poly, !level && !j);
    }
  }

  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;

  // Load the persisted accumulator
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = global_slice[tid];
    tid += params::degree / params::opt;
  }

  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
  // accumulator
  NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
  add_to_torus<Torus, params>(accumulator_fft, accumulator);

  if (lwe_iteration + 1 == lwe_dimension) {
    // Last iteration
    auto block_lwe_array_out =
        &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

    if (blockIdx.y < glwe_dimension) {
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra
      // blocks, in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    } else if (blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
  } else {
    // Persist the updated accumulator
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
      tid += params::degree / params::opt;
    }
  }
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
          polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);

  uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
  uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
  uint64_t full_dm = full_sm_step_one;

  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm_step_two) {
    device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
                 input_lwe_ciphertext_count * (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm_step_one) {
    device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
                 level_count * (glwe_dimension + 1);
  }
  // Otherwise, both kernels run all in shared memory
  uint64_t buffer_size = device_mem +
                         // global_accumulator_fft
                         (glwe_dimension + 1) * level_count *
                             input_lwe_ciphertext_count *
                             (polynomial_size / 2) * sizeof(double2) +
                         // global_accumulator
                         (glwe_dimension + 1) * input_lwe_ciphertext_count *
                             polynomial_size * sizeof(Torus);
  return buffer_size + buffer_size % sizeof(double2);
}
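
// Worked example (illustrative parameter values): for Torus = uint64_t,
// glwe_dimension = 1, polynomial_size = 1024, level_count = 2 and one input
// ciphertext on a device where both kernels fit entirely in shared memory
// (device_mem = 0), the buffer holds
//   (1 + 1) * 2 * 1 * (1024 / 2) * sizeof(double2) = 32768 bytes (fft part)
//   (1 + 1) * 1 * 1024 * sizeof(uint64_t)          = 16384 bytes (accumulator)
// for a total of 49152 bytes, already a multiple of sizeof(double2).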

template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_low_latency(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
          polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);

  // Configure step one
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
        device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
    cudaFuncSetCacheConfig(
        device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  // Configure step two
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
        device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
    cudaFuncSetCacheConfig(
        device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  if (allocate_gpu_memory) {
    uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
    check_cuda_error(cudaGetLastError());
  }
}

template <typename Torus, class params>
__host__ void execute_low_latency_step_one(
    cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
    int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
    uint64_t full_sm, uint64_t full_dm) {

  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

  if (max_shared_memory < partial_sm) {
    device_bootstrap_low_latency_step_one<Torus, params, NOSM>
        <<<grid, thds, 0, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_bootstrap_low_latency_step_one<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0);
  }
  check_cuda_error(cudaGetLastError());
}

template <typename Torus, class params>
__host__ void execute_low_latency_step_two(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
    int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
    uint64_t full_sm, uint64_t full_dm) {

  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

  if (max_shared_memory < partial_sm) {
    device_bootstrap_low_latency_step_two<Torus, params, NOSM>
        <<<grid, thds, 0, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_bootstrap_low_latency_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0);
  }
  check_cuda_error(cudaGetLastError());
}
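
// Added note: the launch geometries of the two steps differ on purpose. Step
// one runs one block per (decomposition level, GLWE polynomial, input), i.e.
// grid (level_count, glwe_dimension + 1, n), because each level of the
// decomposed accumulator is transformed to the Fourier domain independently.
// Step two reduces over the levels in the Fourier domain, so its grid shrinks
// to (n, glwe_dimension + 1).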
/*
 * Host wrapper to the low-latency version of the bootstrap.
 */
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
    uint32_t max_shared_memory) {
  cudaSetDevice(stream->gpu_index);

  // With shared memory (SM), each block handles either the mask or the body,
  // so there is no need to duplicate the data for each of them
  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
          polynomial_size);

  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);

  uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
  uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
  uint64_t full_dm_step_one = full_sm_step_one;
  uint64_t full_dm_step_two = full_sm_step_two;

  double2 *global_accumulator_fft = (double2 *)pbs_buffer;
  Torus *global_accumulator =
      (Torus *)global_accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
                  input_lwe_ciphertext_count * (polynomial_size / 2) /
                  sizeof(Torus));
  int8_t *d_mem = (int8_t *)global_accumulator +
                  (ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
                              input_lwe_ciphertext_count * polynomial_size /
                              sizeof(int8_t));

  for (int i = 0; i < lwe_dimension; i++) {
    execute_low_latency_step_one<Torus, params>(
        stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, global_accumulator, global_accumulator_fft,
        input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
        polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_low_latency_step_two<Torus, params>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem,
        max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two,
        full_dm_step_two);
  }
}
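
// Added for exposition (illustrative sketch, not part of the original diff):
// a thin wrapper showing the expected call shape for a batch that shares a
// single LUT. The `example_` name is hypothetical; every parameter is
// forwarded unchanged, and `pbs_buffer` must have been sized by the scratch
// function above.
template <typename Torus, class params>
__host__ void example_host_bootstrap_low_latency_single_lut(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t max_shared_memory) {
  // One LUT for the whole batch keeps the example minimal; the LUT indexes
  // are then expected to all point at LUT 0.
  host_bootstrap_low_latency<Torus, params>(
      stream, lwe_array_out, lwe_output_indexes, lut_vector,
      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
      pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
      level_count, num_samples, /*num_lut_vectors=*/1, max_shared_memory);
}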

#endif // LOWLAT_PBS_H
@@ -0,0 +1,485 @@
#include "../polynomial/parameters.cuh"
#include "bootstrap_fast_multibit.cuh"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include <algorithm> // std::max
#include <cassert>   // assert
#include <cstring>   // std::strstr

void checks_multi_bit_pbs(int polynomial_size) {
  assert(
      ("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192, 16384",
       polynomial_size == 256 || polynomial_size == 512 ||
           polynomial_size == 1024 || polynomial_size == 2048 ||
           polynomial_size == 4096 || polynomial_size == 8192 ||
           polynomial_size == 16384));
}

void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {

  checks_multi_bit_pbs(polynomial_size);

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<256>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<512>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<1024>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<2048>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<4096>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<8192>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<16384>>(
            glwe_dimension, level_count, num_samples, max_shared_memory)) {
      host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    } else {
      host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
          static_cast<uint64_t *>(lwe_array_in),
          static_cast<uint64_t *>(lwe_input_indexes),
          static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
          glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
          base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
          max_shared_memory, lwe_chunk_size);
    }
    break;
  default:
    break;
  }
}
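
// Design note (added for exposition): each case above differs only in the
// compile-time degree, which must be a template argument. A hypothetical
// refactor could express the branch body once with a macro expanded per
// supported size; a minimal sketch, where ARGS stands for the argument list
// repeated verbatim in every case above:
//
//   #define MULTI_BIT_PBS_CASE(N)                                           \
//     case N:                                                               \
//       if (verify_cuda_bootstrap_fast_multi_bit_grid_size<                 \
//               uint64_t, AmortizedDegree<N>>(glwe_dimension, level_count,  \
//                                             num_samples,                  \
//                                             max_shared_memory))           \
//         host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<N>>(   \
//             ARGS);                                                        \
//       else                                                                \
//         host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<N>>(ARGS);  \
//       break;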

void scratch_cuda_multi_bit_pbs_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory,
    uint32_t lwe_chunk_size) {

  switch (polynomial_size) {
  case 256:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<256>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  case 512:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<512>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  case 2048:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<2048>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  case 4096:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<4096>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  case 8192:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<8192>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  case 16384:
    if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
                                                       AmortizedDegree<16384>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory)) {
      scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    } else {
      scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, grouping_factor,
          max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
    }
    break;
  default:
    break;
  }
}

void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer) {
  // Free memory
  cuda_drop_async(*pbs_buffer, stream);
}

// Pick the best possible chunk size for each GPU
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
                                     uint32_t level_count,
                                     uint32_t glwe_dimension,
                                     uint32_t num_samples) {

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0

  const char *v100Name = "V100"; // Known name of V100 GPU
  const char *a100Name = "A100"; // Known name of A100 GPU
  const char *h100Name = "H100"; // Known name of H100 GPU

  if (std::strstr(deviceProp.name, v100Name) != nullptr) {
    // Tesla V100
    if (num_samples == 1)
      return 60;
    else if (num_samples == 2)
      return 40;
    else if (num_samples <= 4)
      return 20;
    else if (num_samples <= 8)
      return 10;
    else if (num_samples <= 16)
      return 40;
    else if (num_samples <= 32)
      return 27;
    else if (num_samples <= 64)
      return 20;
    else if (num_samples <= 128)
      return 18;
    else if (num_samples <= 256)
      return 16;
    else if (num_samples <= 512)
      return 15;
    else if (num_samples <= 1024)
      return 15;
    else
      return 12;
  } else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
    // Tesla A100
    if (num_samples < 4)
      return 11;
    else if (num_samples < 8)
      return 6;
    else if (num_samples < 16)
      return 13;
    else if (num_samples < 64)
      return 19;
    else if (num_samples < 128)
      return 1;
    else if (num_samples < 512)
      return 19;
    else if (num_samples < 1024)
      return 17;
    else if (num_samples < 8192)
      return 19;
    else if (num_samples < 16384)
      return 12;
    else
      return 9;
  } else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
    // Tesla H100
    return 45;
  }

  // Generic case
  return 1;
}
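
// Added for exposition (illustrative sketch, not part of the original diff):
// how the chunk size is consumed. The multi-bit PBS walks the
// lwe_dimension / grouping_factor key groups in chunks of at most
// lwe_chunk_size elements, so the number of keybundle passes is the ceiling
// of their ratio. `example_num_chunks` is a hypothetical helper name.
static inline uint32_t example_num_chunks(uint32_t lwe_dimension,
                                          uint32_t grouping_factor,
                                          uint32_t lwe_chunk_size) {
  uint32_t groups = lwe_dimension / grouping_factor;
  return (groups + lwe_chunk_size - 1) / lwe_chunk_size; // ceiling division
}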

// Returns a chunk size that is close to optimal, without tuning it per batch
// size
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
                                             uint32_t level_count,
                                             uint32_t glwe_dimension) {

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0

  const char *v100Name = "V100"; // Known name of V100 GPU
  const char *a100Name = "A100"; // Known name of A100 GPU
  const char *h100Name = "H100"; // Known name of H100 GPU

  if (std::strstr(deviceProp.name, v100Name) != nullptr) {
    // Tesla V100
    return 18;
  } else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
    // Tesla A100
    return 45;
  } else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
    // Tesla H100
    return 45;
  }

  // Generic case
  return 10;
}

// Returns the maximum buffer size required to execute batches of up to
// max_input_lwe_ciphertext_count inputs
// todo: Deprecate this function
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_input_lwe_ciphertext_count) {

  uint64_t max_buffer_size = 0;
  for (uint32_t input_lwe_ciphertext_count = 1;
       input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
       input_lwe_ciphertext_count *= 2) {
    max_buffer_size = std::max(
        max_buffer_size,
        get_buffer_size_multibit_bootstrap<uint64_t>(
            glwe_dimension, polynomial_size, level_count,
            input_lwe_ciphertext_count,
            get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
                               input_lwe_ciphertext_count)));
  }

  return max_buffer_size;
}
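
// Added usage note (parameter values below are arbitrary placeholders, not
// recommended settings): sizing a buffer once for any batch of up to 1024
// ciphertexts would look like
//
//   uint64_t bytes = get_max_buffer_size_multibit_bootstrap(
//       /*lwe_dimension=*/742, /*glwe_dimension=*/1,
//       /*polynomial_size=*/2048, /*level_count=*/1,
//       /*max_input_lwe_ciphertext_count=*/1024);
//
// The scan over power-of-two batch sizes is needed because
// get_lwe_chunk_size is not monotonic in num_samples, so the largest batch
// does not necessarily yield the largest buffer.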
@@ -0,0 +1,476 @@
#ifndef CUDA_MULTIBIT_PBS_CUH
#define CUDA_MULTIBIT_PBS_CUH

#include "bootstrap.h"
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
#include <vector>

template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
                                            uint32_t ggsw_idx,
                                            uint32_t grouping_factor) {
  // Sum the LWE mask elements of the group that are selected by the bits of
  // ggsw_idx, then rescale the result to a monomial degree in [0, 2N)
  Torus x = 0;
  for (int i = 0; i < grouping_factor; i++) {
    uint32_t mask_position = grouping_factor - (i + 1);
    int selection_bit = (ggsw_idx >> mask_position) & 1;
    x += selection_bit * lwe_array_group[i];
  }

  return rescale_torus_element(x, 2 * params::degree);
}
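
// Added worked example: with grouping_factor = 2, each key group has four
// GGSW entries indexed by ggsw_idx in [0, 4). For ggsw_idx = 0b10, iteration
// i = 0 checks bit 1 (set, so a_0 is added) and iteration i = 1 checks bit 0
// (clear, so a_1 is skipped); the sum a_0 is then rescaled from the Torus to
// a monomial degree in [0, 2 * params::degree).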

template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_keybundle(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
    Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
    uint32_t keybundle_size_per_input) {

  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory = sharedmem;

  // Ids
  uint32_t level_id = blockIdx.z;
  uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1);
  uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
  uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
  uint32_t input_idx = blockIdx.x / lwe_chunk_size;

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {
    Torus *accumulator = (Torus *)selected_memory;

    Torus *block_lwe_array_in =
        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];

    double2 *keybundle = keybundle_array +
                         // select the input
                         input_idx * keybundle_size_per_input;

    ////////////////////////////////////////////////////////////
    // Computes all keybundles
    uint32_t rev_lwe_iteration =
        ((lwe_dimension / grouping_factor) - lwe_iteration - 1);

    // Keygen guarantees the first term is a constant polynomial, so no
    // monomial multiplication is required for it
    Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
        bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
        grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
    Torus *bsk_poly = bsk_slice + poly_id * params::degree;

    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
        bsk_poly, accumulator);

    // Accumulate the other terms
    for (int g = 1; g < (1 << grouping_factor); g++) {

      Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
          bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
          grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
      Torus *bsk_poly = bsk_slice + poly_id * params::degree;

      // Calculate the monomial degree
      Torus *lwe_array_group =
          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
      uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
          lwe_array_group, g, grouping_factor);

      synchronize_threads_in_block();
      // Multiply by the bsk element
      polynomial_product_accumulate_by_monomial<Torus, params>(
          accumulator, bsk_poly, monomial_degree, false);
    }

    synchronize_threads_in_block();

    double2 *fft = (double2 *)sharedmem;

    // Move accumulator to local memory
    double2 temp[params::opt / 2];
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
      temp[i].y =
          __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
      temp[i].x /= (double)std::numeric_limits<Torus>::max();
      temp[i].y /= (double)std::numeric_limits<Torus>::max();
      tid += params::degree / params::opt;
    }

    synchronize_threads_in_block();
    // Move from local memory back to shared memory, but as complex values
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      fft[tid] = temp[i];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
    NSMFFT_direct<HalfDegree<params>>(fft);

    // Write this LWE iteration's keybundle
    auto keybundle_out = get_ith_mask_kth_block(
        keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
        polynomial_size, glwe_dimension, level_count);
    auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;

    copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
        fft, keybundle_poly);
  }
}

template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_one(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *global_accumulator,
    double2 *global_accumulator_fft, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_iteration) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;

  selected_memory = sharedmem;

  Torus *accumulator = (Torus *)selected_memory;
  double2 *accumulator_fft =
      (double2 *)accumulator +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));

  Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
                                        params::degree * (glwe_dimension + 1)];

  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

  double2 *global_fft_slice =
      global_accumulator_fft +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
       blockIdx.z * level_count * (glwe_dimension + 1)) *
          (polynomial_size / 2);

  if (lwe_iteration == 0) {
    // First iteration
    ////////////////////////////////////////////////////////////
    // Initializes the accumulator with the body of the LWE input
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                          2 * params::degree);

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);

    // Persist
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
        accumulator, global_slice);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
        global_slice, accumulator);
  }

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
      accumulator, base_log, level_count);

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
  gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
                                                    blockIdx.x);

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(accumulator_fft);

  copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
      accumulator_fft, global_fft_slice);
}

template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_two(
    Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
    uint32_t lwe_offset, uint32_t lwe_chunk_size) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;

  selected_memory = sharedmem;
  double2 *accumulator_fft = (double2 *)selected_memory;

  double2 *keybundle = keybundle_array +
                       // select the input
                       blockIdx.x * lwe_chunk_size * level_count *
                           (glwe_dimension + 1) * (glwe_dimension + 1) *
                           (polynomial_size / 2);

  double2 *global_accumulator_fft_input =
      global_accumulator_fft +
      blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);

  for (int level = 0; level < level_count; level++) {
    double2 *global_fft_slice =
        global_accumulator_fft_input +
        level * (glwe_dimension + 1) * (polynomial_size / 2);

    for (int j = 0; j < (glwe_dimension + 1); j++) {
      double2 *fft = global_fft_slice + j * params::degree / 2;

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      auto bsk_slice =
          get_ith_mask_kth_block(keybundle, iteration, j, level,
                                 polynomial_size, glwe_dimension, level_count);
      auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;

      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          accumulator_fft, fft, bsk_poly, !level && !j);
    }
  }

  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
  // accumulator
  NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;

  add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
  synchronize_threads_in_block();

  uint32_t lwe_iteration = iteration + lwe_offset;
  if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
    // Last iteration
    auto block_lwe_array_out =
        &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

    if (blockIdx.y < glwe_dimension) {
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra
      // blocks, in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
    } else if (blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
    }
  }
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_keybundle(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_one(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_two(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {

  uint64_t buffer_size = 0;
  buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
                 (glwe_dimension + 1) * (glwe_dimension + 1) *
                 (polynomial_size / 2) * sizeof(double2); // keybundle fft
  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
                 level_count * (polynomial_size / 2) *
                 sizeof(double2); // global_accumulator_fft
  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
                 polynomial_size * sizeof(Torus); // global_accumulator

  return buffer_size + buffer_size % sizeof(double2);
}
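
// Added worked example (values are illustrative only, with Torus = uint64_t):
// for glwe_dimension = 1, polynomial_size = 2048, level_count = 1, one input
// ciphertext and lwe_chunk_size = 18, the three contributions are
//   keybundle fft:          1 * 18 * 1 * 2 * 2 * 1024 * 16 B = 1,179,648 B
//   global_accumulator_fft: 1 * 2 * 1 * 1024 * 16 B          =    32,768 B
//   global_accumulator:     1 * 2 * 2048 * 8 B               =    32,768 B
// i.e. about 1.2 MB per input at that chunk size, dominated by the keybundle.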

template <typename Torus, typename STorus, typename params>
__host__ void
scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
                      uint32_t lwe_dimension, uint32_t glwe_dimension,
                      uint32_t polynomial_size, uint32_t level_count,
                      uint32_t input_lwe_ciphertext_count,
                      uint32_t grouping_factor, uint32_t max_shared_memory,
                      bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {

  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
          polynomial_size);

  check_cuda_error(cudaFuncSetAttribute(
      device_multi_bit_bootstrap_keybundle<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
  cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
                         cudaFuncCachePreferShared);
  check_cuda_error(cudaGetLastError());

  check_cuda_error(cudaFuncSetAttribute(
      device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize,
      full_sm_accumulate_step_one));
  cudaFuncSetCacheConfig(
      device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
      cudaFuncCachePreferShared);
  check_cuda_error(cudaGetLastError());

  check_cuda_error(cudaFuncSetAttribute(
      device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize,
      full_sm_accumulate_step_two));
  cudaFuncSetCacheConfig(
      device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
      cudaFuncCachePreferShared);
  check_cuda_error(cudaGetLastError());

  if (allocate_gpu_memory) {
    if (!lwe_chunk_size)
      lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
                                                  glwe_dimension);

    uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, lwe_chunk_size);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
    check_cuda_error(cudaGetLastError());
  }
}

template <typename Torus, typename STorus, class params>
__host__ void host_multi_bit_pbs(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
  cudaSetDevice(stream->gpu_index);

  // If a chunk size is not passed to this function, select one.
  if (!lwe_chunk_size)
    lwe_chunk_size =
        get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension);

  // Buffer layout: keybundle fft, then global accumulator fft, then global
  // accumulator
  double2 *keybundle_fft = (double2 *)pbs_buffer;
  double2 *global_accumulator_fft =
      (double2 *)keybundle_fft +
      num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
          (glwe_dimension + 1) * (polynomial_size / 2);
  Torus *global_accumulator =
      (Torus *)global_accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
                  level_count * (polynomial_size / 2) / sizeof(Torus));

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
          polynomial_size);

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
      (glwe_dimension + 1) * (polynomial_size / 2);

  dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
  dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {

    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);

    // Compute a keybundle
    dim3 grid_keybundle(num_samples * chunk_size,
                        (glwe_dimension + 1) * (glwe_dimension + 1),
                        level_count);
    device_multi_bit_bootstrap_keybundle<Torus, params>
        <<<grid_keybundle, thds, full_sm_keybundle, stream->stream>>>(
            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
            base_log, level_count, lwe_offset, chunk_size,
            keybundle_size_per_input);
    check_cuda_error(cudaGetLastError());

    // Accumulate
    for (int j = 0; j < chunk_size; j++) {
      device_multi_bit_bootstrap_accumulate_step_one<Torus, params>
          <<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
             stream->stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
                               lut_vector_indexes, global_accumulator,
                               global_accumulator_fft, lwe_dimension,
                               glwe_dimension, polynomial_size, base_log,
                               level_count, j + lwe_offset);
      check_cuda_error(cudaGetLastError());

      device_multi_bit_bootstrap_accumulate_step_two<Torus, params>
          <<<grid_accumulate_step_two, thds, full_sm_accumulate_step_two,
             stream->stream>>>(lwe_array_out, lwe_output_indexes,
                               keybundle_fft, global_accumulator,
                               global_accumulator_fft, lwe_dimension,
                               glwe_dimension, polynomial_size, level_count,
                               grouping_factor, j, lwe_offset, lwe_chunk_size);
      check_cuda_error(cudaGetLastError());
    }
  }
}
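
// Added for exposition (illustrative sketch, not part of the original diff):
// the expected call sequence around host_multi_bit_pbs — size the scratch
// buffer, run the PBS, release the buffer. The `example_` name is
// hypothetical and every parameter is forwarded unchanged.
template <typename Torus, typename STorus, class params>
__host__ void example_multi_bit_pbs_roundtrip(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;
  scratch_multi_bit_pbs<Torus, STorus, params>(
      stream, &pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, num_samples, grouping_factor, max_shared_memory,
      /*allocate_gpu_memory=*/true);
  host_multi_bit_pbs<Torus, STorus, params>(
      stream, lwe_array_out, lwe_output_indexes, lut_vector,
      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
      pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
      grouping_factor, base_log, level_count, num_samples,
      /*num_lut_vectors=*/1, /*lwe_idx=*/0, max_shared_memory);
  cuda_drop_async(pbs_buffer, stream);
}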
#endif // MULTIBIT_PBS_H
@@ -0,0 +1,500 @@
#ifndef CUDA_BSK_CUH
#define CUDA_BSK_CUH

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "polynomial/parameters.cuh"
#include <atomic>
#include <cstdint>

__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t level_count) {
  return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
         level_count;
}

////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
}

template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              glwe_dimension * polynomial_size / 2];
}
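
// Added layout note: the bootstrapping key is a flat array of GGSW
// ciphertexts in the Fourier domain. One GGSW holds level_count matrices of
// (glwe_dimension + 1) x (glwe_dimension + 1) polynomials of N/2 complex
// coefficients, which is exactly the per-GGSW stride computed in
// get_start_ith_ggsw. Within a GGSW, get_ith_mask_kth_block skips `level`
// level-matrices and then `k` rows of (glwe_dimension + 1) polynomials to
// land on the mask; the body variant offsets glwe_dimension further
// polynomials to reach the last (body) column of that row.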

////////////////////////////////////////////////
__device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
                                        uint32_t polynomial_size,
                                        uint32_t glwe_dimension,
                                        uint32_t level_count) {
  return i * (1 << grouping_factor) * polynomial_size / 2 *
         (glwe_dimension + 1) * (glwe_dimension + 1) * level_count;
}

template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
  T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
                                         glwe_dimension, level_count);
  return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
                                glwe_dimension, level_count);
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
                                    cuda_stream_t *stream,
                                    uint32_t input_lwe_dim, uint32_t glwe_dim,
                                    uint32_t level_count,
                                    uint32_t polynomial_size,
                                    uint32_t total_polynomials) {

  cudaSetDevice(stream->gpu_index);
  int shared_memory_size = sizeof(double) * polynomial_size;

  // The buffer size is the number of polynomials times the polynomial size
  // over 2, in double2 elements, because each polynomial is compressed into
  // the complex domain to perform the FFT
  size_t buffer_size =
      total_polynomials * polynomial_size / 2 * sizeof(double2);

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

  double2 *h_bsk = (double2 *)malloc(buffer_size);

  double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream);

  // Compress the real bsk into the complex domain and normalize it by the
  // maximum Torus value
  for (int i = 0; i < total_polynomials; i++) {
    int complex_current_poly_idx = i * polynomial_size / 2;
    int torus_current_poly_idx = i * polynomial_size;
    for (int j = 0; j < polynomial_size / 2; j++) {
      h_bsk[complex_current_poly_idx + j].x = src[torus_current_poly_idx + j];
      h_bsk[complex_current_poly_idx + j].y =
          src[torus_current_poly_idx + j + polynomial_size / 2];
      h_bsk[complex_current_poly_idx + j].x /=
          (double)std::numeric_limits<T>::max();
      h_bsk[complex_current_poly_idx + j].y /=
          (double)std::numeric_limits<T>::max();
    }
  }
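
  // Added note: at this point h_bsk holds the negacyclic compression of the
  // key. For each polynomial, coefficient j of the first half is the real
  // part and coefficient j + N/2 the imaginary part of one complex entry,
  // both normalized by the maximum Torus value so the forward FFT below
  // operates on doubles in a small, uniform range.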
|
||||
|
||||
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
|
||||
|
||||
double2 *buffer;
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
|
||||
d_bsk, dest, buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, stream);
|
||||
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  default:
    break;
  }

  cuda_drop_async(d_bsk, stream);
  cuda_drop_async(buffer, stream);
  free(h_bsk);
}
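
// Editor's note (added commentary, not in the original source): each case in
// the switch above dispatches the same batched FFT kernel in one of two
// flavours - a FULLSM variant that keeps the per-polynomial working set in
// dynamic shared memory when it fits under cuda_get_max_shared_memory, and a
// NOSM fallback that instead allocates
// shared_memory_size * total_polynomials bytes of global-memory scratch.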

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
                                       cuda_stream_t *stream,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
      (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
      level_count, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
                                       cuda_stream_t *stream,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
      (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
      level_count, polynomial_size, total_polynomials);
}
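
// Editor's sketch (hypothetical host-side usage, not in the original source):
// the destination must hold one double per standard-domain coefficient, i.e.
// total_polynomials * polynomial_size doubles, before the conversion is
// enqueued on the stream:
//   size_t bsk_coeffs = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
//                       level_count * polynomial_size;
//   double2 *d_fourier_bsk =
//       (double2 *)cuda_malloc_async(bsk_coeffs * sizeof(double), stream);
//   cuda_convert_lwe_bootstrap_key_64(d_fourier_bsk, h_bsk, stream,
//                                     input_lwe_dim, glwe_dim, level_count,
//                                     polynomial_size);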

void cuda_convert_lwe_multi_bit_bootstrap_key_64(
    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
    uint32_t grouping_factor) {
  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
                               level_count * (1 << grouping_factor) /
                               grouping_factor;
  size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);

  cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
                           stream);
}

void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                                 cuda_stream_t *stream,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials) {

  auto input1 = (double2 *)_input1;
  auto input2 = (double2 *)_input2;
  auto output = (double2 *)_output;

  size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

  double2 *buffer;
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  default:
    break;
  }
  cuda_drop_async(buffer, stream);
}

// We need these lines so the compiler knows how to specialize these functions
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
                                                    int level,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t level_count);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int level,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
    uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);

template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
    double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif // CNCRT_BSK_H
@@ -0,0 +1,305 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS_CUH
#define GPU_POLYNOMIAL_FUNCTIONS_CUH

#include "crypto/torus.cuh"
#include "device.h"

// Return A if C == 0 and B if C == 1
#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
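
// Editor's sketch (not in the original source): SEL is a branchless,
// constant-time select; -(C) is all-ones when C == 1, so the XOR mask flips
// A into B. Two compile-time sanity checks, assuming only the macro above:
static_assert(SEL(7, 9, 0) == 7, "C == 0 selects A");
static_assert(SEL(7, 9, 1) == 9, "C == 1 selects B");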

/*
 * Compresses a decomposed buffer into a half-size complex buffer for the FFT.
 */
template <class params>
__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid].x = __int2double_rn(src[2 * tid]);
    dst[tid].y = __int2double_rn(src[2 * tid + 1]);
    tid += params::degree / params::opt;
  }
}

/*
 * Copies a source polynomial into the i-th slice of a batch of polynomials.
 * Used only in the low latency version.
 */
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
  int tid = threadIdx.x;
  int begin = i * (params::degree / 2 + 1);
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid + begin] = source[tid];
    tid = tid + params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst[params::degree / 2 + begin] = source[params::degree / 2];
  }
}

template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial(T *source, T *dst) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < elems_per_thread; i++) {
    dst[tid] = source[tid];
    tid = tid + block_size;
  }
}

/*
 * Accumulates a source polynomial into the p_id-th slice of a batch of
 * polynomials. Used only in the low latency version.
 */
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
  int tid = threadIdx.x;
  int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid] += source[tid + begin];
    tid = tid + params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst[params::degree / 2] += source[params::degree / 2 + begin];
  }
}

/*
 * Receives num_poly concatenated polynomials of type T. For each one:
 *
 * Performs acc = input / X^j (division by the monomial X^j in the negacyclic
 * ring where X^degree = -1) if zeroAcc is false, and acc = 0 otherwise.
 *
 * By default, it works on a single polynomial.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
                                                      uint32_t j, bool zeroAcc,
                                                      uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
    T *input_slice = (T *)input + (ptrdiff_t)(z * degree);

    int tid = threadIdx.x;
    if (zeroAcc) {
      for (int i = 0; i < elems_per_thread; i++) {
        accumulator_slice[tid] = 0;
        tid += block_size;
      }
    } else {
      tid = threadIdx.x;
      for (int i = 0; i < elems_per_thread; i++) {
        if (j < degree) {
          // if (tid < degree - j)
          //   accumulator_slice[tid] = input_slice[tid + j];
          // else
          //   accumulator_slice[tid] = -input_slice[tid - degree + j];
          int x = tid + j - SEL(degree, 0, tid < degree - j);
          accumulator_slice[tid] =
              SEL(-1, 1, tid < degree - j) * input_slice[x];
        } else {
          int32_t jj = j - degree;
          // if (tid < degree - jj)
          //   accumulator_slice[tid] = -input_slice[tid + jj];
          // else
          //   accumulator_slice[tid] = input_slice[tid - degree + jj];
          int x = tid + jj - SEL(degree, 0, tid < degree - jj);
          accumulator_slice[tid] =
              SEL(1, -1, tid < degree - jj) * input_slice[x];
        }
        tid += block_size;
      }
    }
  }
}

/*
 * Receives num_poly concatenated polynomials of type T. For each one:
 *
 * Performs result_acc = acc * X^j - acc (multiplication by the monomial X^j
 * in the negacyclic ring, followed by subtraction of the original polynomial).
 * Takes a single buffer as input and writes a single rotated buffer as output.
 *
 * By default, it works on a single polynomial.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
    T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
    T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
    int tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      if (j < degree) {
        // if (tid < j)
        //   result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
        // else
        //   result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
        int x = tid - j + SEL(0, degree, tid < j);
        result_acc_slice[tid] =
            SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
      } else {
        int32_t jj = j - degree;
        // if (tid < jj)
        //   result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];
        // else
        //   result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
        int x = tid - jj + SEL(0, degree, tid < jj);
        result_acc_slice[tid] =
            SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
      }
      tid += block_size;
    }
  }
}

/*
 * Receives num_poly concatenated polynomials of type T. For each, performs a
 * rounding to increase the accuracy of the PBS. Calculates in place.
 *
 * By default, it works on a single polynomial.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                  int level_count,
                                                  uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
    int tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      T x_acc = rotated_acc_slice[tid];
      T shift = sizeof(T) * 8 - level_count * base_log;
      T mask = 1ll << (shift - 1);
      T b_acc = (x_acc & mask) >> (shift - 1);
      T res_acc = x_acc >> shift;
      res_acc += b_acc;
      res_acc <<= shift;
      rotated_acc_slice[tid] = res_acc;
      tid = tid + block_size;
    }
  }
}
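
// Editor's worked example (not in the original source): for T = uint64_t,
// base_log = 8 and level_count = 4, shift = 64 - 32 = 32, so coefficients are
// rounded to the nearest multiple of 2^32:
//   0x0000000180000000 -> 0x0000000200000000 (carry bit set, rounds up)
//   0x000000017FFFFFFF -> 0x0000000100000000 (carry bit clear, rounds down)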

template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result,
                             bool init_torus = false) {
  Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    double v1 = m_values[tid].x;
    double v2 = m_values[tid].y;

    double frac = v1 - floor(v1);
    frac *= mx;
    double carry = frac - floor(frac);
    frac += (carry >= 0.5);

    Torus V1 = 0;
    typecast_double_to_torus<Torus>(frac, V1);

    frac = v2 - floor(v2);
    frac *= mx;
    // mirror the v1 path: the carry must be computed from the scaled
    // fractional part, not from the raw input value
    carry = frac - floor(frac);
    frac += (carry >= 0.5);

    Torus V2 = 0;
    typecast_double_to_torus<Torus>(frac, V2);

    if (init_torus) {
      result[tid] = V1;
      result[tid + params::degree / 2] = V2;
    } else {
      result[tid] += V1;
      result[tid + params::degree / 2] += V2;
    }
    tid = tid + params::degree / params::opt;
  }
}
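
// Editor's worked example (not in the original source): for Torus = uint64_t,
// an FFT output v1 = -0.25 has frac = v1 - floor(v1) = 0.75, which is scaled
// by mx to land three quarters of the way around the torus
// (~0xC000000000000000); the carry term rounds the scaled value to the
// nearest integer.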

// Extracts the body of a GLWE.
// k is the offset to find the body element / polynomial in the lwe_array_out /
// accumulator
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
                                    uint32_t k) {
  // Set first coefficient of the accumulator as the body of the LWE sample
  lwe_array_out[k * params::degree] = accumulator[k * params::degree];
}

// Extracts the mask from num_poly polynomials individually
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
                                    uint32_t num_poly = 1) {
  for (int z = 0; z < num_poly; z++) {
    Torus *lwe_array_out_slice =
        (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
    Torus *accumulator_slice =
        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);

    // Set ACC = -ACC
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      accumulator_slice[tid] = -accumulator_slice[tid];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();

    // Reverse the accumulator
    tid = threadIdx.x;
    Torus result[params::opt];
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      result[i] = accumulator_slice[params::degree - tid - 1];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      accumulator_slice[tid] = result[i];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();

    // Perform ACC * X
    // (equivalent to multiply_by_monomial_negacyclic_inplace(1)),
    // reusing the result array declared above
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      // if (tid < 1)
      //   result[i] = -accumulator_slice[tid - 1 + params::degree];
      // else
      //   result[i] = accumulator_slice[tid - 1];
      int x = tid - 1 + SEL(0, params::degree, tid < 1);
      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      accumulator_slice[tid] = result[i];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();

    // Copy to the mask of the LWE sample
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      lwe_array_out_slice[tid] = accumulator_slice[tid];
      tid = tid + params::degree / params::opt;
    }
  }
}

#endif
@@ -0,0 +1,91 @@
#ifndef CUDA_PARAMETERS_CUH
#define CUDA_PARAMETERS_CUH

constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }

constexpr int choose_opt_amortized(int degree) {
  if (degree <= 1024)
    return 4;
  else if (degree == 2048)
    return 8;
  else if (degree == 4096)
    return 16;
  else if (degree == 8192)
    return 32;
  else
    return 64;
}

constexpr int choose_opt(int degree) {
  if (degree <= 1024)
    return 4;
  else if (degree == 2048)
    return 4;
  else if (degree == 4096)
    return 4;
  else if (degree == 8192)
    return 8;
  else if (degree == 16384)
    return 16;
  else
    return 64;
}
template <class params> class HalfDegree {
public:
  constexpr static int degree = params::degree / 2;
  constexpr static int opt = params::opt / 2;
  constexpr static int log2_degree = params::log2_degree - 1;
};

template <int N> class Degree {
public:
  constexpr static int degree = N;
  constexpr static int opt = choose_opt(N);
  constexpr static int log2_degree = log2(N);
};

template <int N> class AmortizedDegree {
public:
  constexpr static int degree = N;
  constexpr static int opt = choose_opt_amortized(N);
  constexpr static int log2_degree = log2(N);
};
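
// Editor's sketch (not in the original source): example instantiation under
// the definitions above - Degree<1024> gives opt = 4 and log2_degree = 10,
// i.e. a block of 1024 / 4 = 256 threads processes four coefficients each.
static_assert(Degree<1024>::opt == 4, "choose_opt(1024) == 4");
static_assert(Degree<1024>::log2_degree == 10, "log2(1024) == 10");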
enum sharedMemDegree { NOSM = 0, PARTIALSM = 1, FULLSM = 2 };

class ForwardFFT {
public:
  constexpr static int direction = 0;
};

class BackwardFFT {
public:
  constexpr static int direction = 1;
};

class ReorderFFT {
public: // made public so FFTDegree/FFTParams can read reorder::reorder
  constexpr static int reorder = 1;
};
class NoReorderFFT {
public: // made public so FFTDegree/FFTParams can read reorder::reorder
  constexpr static int reorder = 0;
};

template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
  constexpr static int fft_direction = direction::direction;
  constexpr static int fft_reorder = reorder::reorder;
};

template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
  constexpr static int fft_direction = direction::direction;
  constexpr static int fft_reorder = reorder::reorder;
};

#endif // CNCRT_PARAMETERS_H
@@ -0,0 +1,86 @@
#ifndef CUDA_POLYNOMIAL_MATH_CUH
#define CUDA_POLYNOMIAL_MATH_CUH

#include "crypto/torus.cuh"
#include "parameters.cuh"

template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
  int pos = chunk_num * chunk_size;
  T *ptr = &data[pos];
  return ptr;
}

template <typename FT, class params>
__device__ void sub_polynomial(FT *result, FT *first, FT *second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    result[tid] = first[tid] - second[tid];
    tid += params::degree / params::opt;
  }
}

template <class params, typename T>
__device__ void polynomial_product_in_fourier_domain(T *result, T *first,
                                                     T *second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] = first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] =
        first[params::degree / 2] * second[params::degree / 2];
  }
}

// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params, typename T>
__device__ void
polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
                                                bool init_accumulator = false) {
  int tid = threadIdx.x;
  if (init_accumulator) {
    for (int i = 0; i < params::opt / 2; i++) {
      result[tid] = first[tid] * second[tid];
      tid += params::degree / params::opt;
    }
  } else {
    for (int i = 0; i < params::opt / 2; i++) {
      result[tid] += first[tid] * second[tid];
      tid += params::degree / params::opt;
    }
  }
}

// Computes result += poly * X^monomial_degree in the negacyclic ring.
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of poly * X^monomial_degree
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, T *poly,
                                          uint64_t monomial_degree,
                                          bool init_accumulator = false) {
  // monomial_degree \in [0, 2 * params::degree)
  int full_cycles_count = monomial_degree / params::degree;
  int remainder_degrees = monomial_degree % params::degree;

  int pos = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    T element = poly[pos];
    int new_pos = (pos + monomial_degree) % params::degree;

    T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
    x = SEL(-x, x, new_pos >= remainder_degrees);

    if (init_accumulator)
      result[new_pos] = x;
    else
      result[new_pos] += x;
    pos += params::degree / params::opt;
  }
}
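
// Editor's worked example (not in the original source): with
// params::degree = 1024 and monomial_degree = 1500, full_cycles_count = 1 and
// remainder_degrees = 476, so poly[pos] lands at (pos + 1500) % 1024, negated
// once for the completed cycle (X^1024 = -1) and negated a second time when
// new_pos < remainder_degrees, i.e. when the shifted exponent crosses the
// degree boundary a second time.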

#endif // CNCRT_POLYNOMIAL_MATH_H
@@ -0,0 +1,97 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH

#include <cassert>
#include <cstdint>
#include <cstdio>

#define SNT 1
#define dPI 6.283185307179586231995926937088

using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;

//--------------------------------------------------
// Basic double2 operations

__device__ inline double2 conjugate(const double2 num) {
  double2 res;
  res.x = num.x;
  res.y = -num.y;
  return res;
}

__device__ inline void operator+=(double2 &lh, const double2 rh) {
  lh.x += rh.x;
  lh.y += rh.y;
}

__device__ inline void operator-=(double2 &lh, const double2 rh) {
  lh.x -= rh.x;
  lh.y -= rh.y;
}

__device__ inline double2 operator+(const double2 a, const double2 b) {
  double2 res;
  res.x = a.x + b.x;
  res.y = a.y + b.y;
  return res;
}

__device__ inline double2 operator-(const double2 a, const double2 b) {
  double2 res;
  res.x = a.x - b.x;
  res.y = a.y - b.y;
  return res;
}

__device__ inline double2 operator*(const double2 a, const double2 b) {
  double xx = a.x * b.x;
  double xy = a.x * b.y;
  double yx = a.y * b.x;
  double yy = a.y * b.y;

  double2 res;
  // asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
  // "d"(b.x), "d"(yy));
  res.x = xx - yy;
  res.y = xy + yx;
  return res;
}

__device__ inline double2 operator*(const double2 a, double b) {
  double2 res;
  res.x = a.x * b;
  res.y = a.y * b;
  return res;
}

__device__ inline void operator*=(double2 &a, const double2 b) {
  double tmp = a.x;
  a.x *= b.x;
  a.x -= a.y * b.y;
  a.y *= b.x;
  a.y += b.y * tmp;
}

__device__ inline void operator*=(double2 &a, const double b) {
  a.x *= b;
  a.y *= b;
}

__device__ inline void operator/=(double2 &a, const double b) {
  a.x /= b;
  a.y /= b;
}

__device__ inline double2 operator*(double a, double2 b) {
  double2 res;
  res.x = b.x * a;
  res.y = b.y * a;
  return res;
}

#endif
@@ -0,0 +1,76 @@
#ifndef CNCRT_INT128_CUH
#define CNCRT_INT128_CUH

// abseil's int128 type
// licensed under Apache license

class uint128 {
public:
  __device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}

  uint64_t lo_;
  uint64_t hi_;
};

class int128 {
public:
  int128() = default;

  __device__ operator unsigned long long() const {
    return static_cast<unsigned long long>(lo_);
  }

  __device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}

  uint64_t lo_;
  int64_t hi_;
};

__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
  return uint128(high, low);
}

template <typename T> __device__ uint128 make_uint128_from_float(T v) {
  if (v >= ldexp(static_cast<T>(1), 64)) {
    uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
    uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
    return make_uint128(hi, lo);
  }

  return make_uint128(0, static_cast<uint64_t>(v));
}

__device__ inline int128 make_int128(int64_t high, uint64_t low) {
  return int128(high, low);
}

__device__ inline int64_t bitcast_to_signed(uint64_t v) {
  return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
                                 : static_cast<int64_t>(v);
}

__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }

__device__ __forceinline__ uint128 operator-(uint128 val) {
  uint64_t hi = ~uint128_high64(val);
  uint64_t lo = ~uint128_low64(val) + 1;
  if (lo == 0)
    ++hi; // carry
  return make_uint128(hi, lo);
}

template <typename T> __device__ int128 make_int128_from_float(T v) {

  // We must convert the absolute value and then negate as needed, because
  // floating point types are typically sign-magnitude. Otherwise, the
  // difference between the high and low 64 bits when interpreted as two's
  // complement overwhelms the precision of the mantissa.
  uint128 result =
      v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);

  return make_int128(bitcast_to_signed(uint128_high64(result)),
                     uint128_low64(result));
}

#endif
@@ -0,0 +1,36 @@
#ifndef HELPER_CUH
#define HELPER_CUH

#include <stdio.h>

template <typename T> __global__ void print_debug_kernel(T *src, int N) {
  for (int i = 0; i < N; i++) {
    printf("%lu, ", src[i]);
  }
}

template <typename T> void print_debug(const char *name, T *src, int N) {
  printf("%s: ", name);
  cudaDeviceSynchronize();
  print_debug_kernel<<<1, 1>>>(src, N);
  cudaDeviceSynchronize();
  printf("\n");
}

template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
  for (int i = 0; i < N; i++) {
    printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
  }
}

template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension) {
  printf("%s: ", name);
  cudaDeviceSynchronize();
  print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
  cudaDeviceSynchronize();
  printf("\n");
}

#endif
@@ -0,0 +1,21 @@
#ifndef KERNEL_DIMENSIONS_CUH
#define KERNEL_DIMENSIONS_CUH

inline int nextPow2(int x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
                                   int &blocks, int &threads) {
  threads =
      (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
  blocks = (n + threads - 1) / threads;
}
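
// Editor's worked example (not in the original source): with
// maxBlockSize = 512, n = 4096 gives threads = 512 and blocks = 8, while
// n = 600 gives threads = max(128, nextPow2(300)) = 512 and blocks = 2.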

#endif // KERNEL_DIMENSIONS_H
18 backends/tfhe-cuda-backend/rust_api/Cargo.toml Normal file
@@ -0,0 +1,18 @@
[package]
name = "tfhe-cuda-backend"
version = "0.1.2"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
description = "Cuda implementation of TFHE-rs primitives."
homepage = "https://www.zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

[build-dependencies]
cmake = { version = "0.1" }

[dependencies]
thiserror = "1.0"
28 backends/tfhe-cuda-backend/rust_api/LICENSE Normal file
@@ -0,0 +1,28 @@
BSD 3-Clause Clear License

Copyright © 2023 ZAMA.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or other
materials provided with the distribution.

3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
or promote products derived from this software without specific prior written permission.

NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 backends/tfhe-cuda-backend/rust_api/build.rs Normal file
@@ -0,0 +1,28 @@
use std::env;
use std::process::Command;

fn main() {
    println!("Build tfhe-cuda-backend");
    if env::consts::OS == "linux" {
        let output = Command::new("./get_os_name.sh").output().unwrap();
        let distribution = String::from_utf8(output.stdout).unwrap();
        if distribution != "Ubuntu\n" {
            println!(
                "cargo:warning=This Linux distribution is not officially supported. \
                Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
            );
        }
        let dest = cmake::build("../implementation");
        println!("cargo:rustc-link-search=native={}", dest.display());
        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
        println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
        println!("cargo:rustc-link-lib=gomp");
        println!("cargo:rustc-link-lib=cudart");
        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
        println!("cargo:rustc-link-lib=stdc++");
    } else {
        panic!(
            "Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
        );
    }
}
3 backends/tfhe-cuda-backend/rust_api/get_os_name.sh Executable file
@@ -0,0 +1,3 @@
#!/bin/bash

cat /etc/os-release | grep "\<NAME\>" | sed "s/NAME=\"//g" | sed "s/\"//g"
794 backends/tfhe-cuda-backend/rust_api/src/cuda_bind.rs Normal file
@@ -0,0 +1,794 @@
use std::ffi::c_void;

#[link(name = "tfhe_cuda_backend", kind = "static")]
extern "C" {

    /// Create a new Cuda stream on GPU `gpu_index`
    pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;

    /// Destroy the Cuda stream `v_stream`
    pub fn cuda_destroy_stream(v_stream: *mut c_void) -> i32;

    /// Allocate `size` memory asynchronously on the GPU attached to the Cuda
    /// stream `v_stream`
    pub fn cuda_malloc_async(size: u64, v_stream: *const c_void) -> *mut c_void;

    /// Copy `size` memory asynchronously from `src` on the GPU to `dest` on the CPU
    /// using the Cuda stream `v_stream`.
    pub fn cuda_memcpy_async_to_cpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
        v_stream: *const c_void,
    ) -> i32;

    /// Copy `size` memory asynchronously from `src` on the CPU to `dest` on the GPU
    /// using the Cuda stream `v_stream`.
    pub fn cuda_memcpy_async_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
        v_stream: *const c_void,
    ) -> i32;

    /// Copy `size` memory asynchronously from `src` to `dest` on the same GPU using
    /// the Cuda stream `v_stream`.
    pub fn cuda_memcpy_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
        size: u64,
        v_stream: *const c_void,
    ) -> i32;

    /// Set `size` memory asynchronously to `value` at `dest` on the GPU using
    /// the Cuda stream `v_stream`.
    pub fn cuda_memset_async(
        dest: *mut c_void,
        value: u64,
        size: u64,
        v_stream: *const c_void,
    ) -> i32;

    /// Get the total number of Nvidia GPUs detected on the platform
    pub fn cuda_get_number_of_gpus() -> i32;

    /// Synchronize all streams on GPU `gpu_index`
    pub fn cuda_synchronize_device(gpu_index: u32) -> i32;

    /// Synchronize the Cuda stream `v_stream`
    pub fn cuda_synchronize_stream(v_stream: *const c_void) -> i32;

    /// Free memory for pointer `ptr` on the GPU asynchronously, using stream `v_stream`
    pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32;

    /// Free memory for pointer `ptr` on the GPU synchronously
    pub fn cuda_drop(ptr: *mut c_void) -> i32;

    /// Get the maximum amount of shared memory on GPU `gpu_index`
    pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32;
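
    // Editor's sketch (hypothetical usage, not part of the original bindings):
    // a typical allocate / copy / free round-trip over the raw calls above,
    // performed inside `unsafe` since these are C FFI declarations:
    //     let stream = cuda_create_stream(0);
    //     let d_buf = cuda_malloc_async(size, stream);
    //     cuda_memcpy_async_to_gpu(d_buf, h_src.as_ptr() as *const c_void, size, stream);
    //     cuda_drop_async(d_buf, stream);
    //     cuda_synchronize_stream(stream);
    //     cuda_destroy_stream(stream);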

    /// Copy a bootstrap key `src` represented with 64 bits in the standard domain from the CPU to
    /// the GPU using the stream `v_stream`, and convert it to the Fourier domain on the
    /// GPU. The resulting bootstrap key `dest` on the GPU is an array of f64 values.
    pub fn cuda_convert_lwe_bootstrap_key_64(
        dest: *mut c_void,
        src: *const c_void,
        v_stream: *const c_void,
        input_lwe_dim: u32,
        glwe_dim: u32,
        level_count: u32,
        polynomial_size: u32,
    );

    /// Copy a multi-bit bootstrap key `src` represented with 64 bits in the standard domain from
    /// the CPU to the GPU using the stream `v_stream`. The resulting bootstrap key
    /// `dest` on the GPU is an array of uint64_t values.
    pub fn cuda_convert_lwe_multi_bit_bootstrap_key_64(
        dest: *mut c_void,
        src: *const c_void,
        v_stream: *const c_void,
        input_lwe_dim: u32,
        glwe_dim: u32,
        level_count: u32,
        polynomial_size: u32,
        grouping_factor: u32,
    );

    /// Copy `number_of_cts` LWE ciphertexts represented with 64 bits in the standard domain from
    /// the CPU to the GPU using the stream `v_stream`. All ciphertexts must be
    /// concatenated.
    pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64(
        dest: *mut c_void,
        src: *mut c_void,
        v_stream: *const c_void,
        number_of_cts: u32,
        lwe_dimension: u32,
    );

    /// Copy `number_of_cts` LWE ciphertexts represented with 64 bits in the standard domain from
    /// the GPU to the CPU using the stream `v_stream`. All ciphertexts must be
    /// concatenated.
    pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64(
        dest: *mut c_void,
        src: *mut c_void,
        v_stream: *const c_void,
        number_of_cts: u32,
        lwe_dimension: u32,
    );

    /// This scratch function allocates the necessary amount of data on the GPU for
    /// the low latency PBS on 64-bit inputs, into `pbs_buffer`. It also configures SM
    /// options on the GPU in case FULLSM or PARTIALSM mode are going to be used.
    pub fn scratch_cuda_bootstrap_low_latency_64(
        v_stream: *const c_void,
        pbs_buffer: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        level_count: u32,
        input_lwe_ciphertext_count: u32,
        max_shared_memory: u32,
        allocate_gpu_memory: bool,
    );

    /// Perform bootstrapping on a batch of input u64 LWE ciphertexts.
    ///
    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
    /// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c =
    ///   (a0,..an-1,b) where n is the LWE dimension
    /// - `lut_vector`: should hold as many test vectors of size polynomial_size as there are
    ///   input ciphertexts, but actually holds `num_lut_vectors` vectors to reduce memory usage
    /// - `lut_vector_indexes`: stores the index corresponding to which test vector to use for
    ///   each sample in `lut_vector`
    /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n
    ///   mask values + 1 body value
    /// - `bootstrapping_key`: GGSW encryption of the LWE secret key sk1 under secret key sk2.
    ///   bsk = Z + sk1 H, where H is the gadget matrix and Z is a matrix (k+1).l containing
    ///   GLWE encryptions of 0 under sk2. bsk is thus a tensor of size (k+1)^2.l.N.n, where l
    ///   is the number of decomposition levels, k is the GLWE dimension and N is the polynomial
    ///   size for GLWE. The polynomial size for GLWE and the test vector are the same because
    ///   they have to be in the same ring to be multiplied.
    /// - `pbs_buffer`: a preallocated buffer to store temporary results
    /// - `lwe_dimension`: size of the Torus vector used to encrypt the input
    ///   LWE ciphertexts - referred to as n above (~ 600)
    /// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT GLWE
    ///   ciphertexts - referred to as k above. Only the value 1 is supported for this parameter.
    /// - `polynomial_size`: size of the test polynomial (test vector) and size of the
    ///   GLWE polynomial (~1024)
    /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
    /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
    /// - `num_samples`: number of encrypted input messages
    /// - `num_lut_vectors`: parameter to set the actual number of test vectors to be used
    /// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In
    ///   case of multi-GPU computing, it is assumed that only a part of the input LWE array is
    ///   copied to each GPU, but the whole LUT array is copied (because the case when the number
    ///   of LUTs is smaller than the number of input LWEs is not trivial to take into account in
    ///   the data repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider
    ///   for a given LWE input in the LUT array `lut_vector`.
    /// - `max_shared_memory`: maximum amount of shared memory to be used inside device functions
    ///
    /// This function calls a wrapper to a device kernel that performs the
    /// bootstrapping:
    /// - the kernel is templatized based on integer discretization and polynomial degree
    /// - num_samples * level_count * (glwe_dimension + 1) blocks of threads are launched, where
    ///   each thread is going to handle one or more polynomial coefficients at each stage,
    ///   for a given level of decomposition, either for the LUT mask or its body:
    ///   - perform the blind rotation
    ///   - round the result
    ///   - get the decomposition for the current level
    ///   - switch to the FFT domain
    ///   - multiply with the bootstrapping key
    ///   - come back to the coefficients representation
    /// - between each stage a synchronization of the threads is necessary (some
    ///   synchronizations happen at the block level, some happen between blocks, using
    ///   cooperative groups).
    /// - in case the device has enough shared memory, temporary arrays used for
    ///   the different stages (accumulators) are stored into the shared memory
    /// - the accumulators serve to combine the results for all decomposition levels
    /// - the constant memory (64K) is used for storing the roots of identity
    ///   values for the FFT
    pub fn cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
        v_stream: *const c_void,
        lwe_array_out: *mut c_void,
        lwe_output_indexes: *const c_void,
        lut_vector: *const c_void,
        lut_vector_indexes: *const c_void,
        lwe_array_in: *const c_void,
        lwe_input_indexes: *const c_void,
        bootstrapping_key: *const c_void,
        pbs_buffer: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        base_log: u32,
        level: u32,
        num_samples: u32,
        num_lut_vectors: u32,
        lwe_idx: u32,
        max_shared_memory: u32,
    );

    /// This cleanup function frees the data for the low latency PBS on GPU
    /// contained in pbs_buffer for 32 or 64-bit inputs.
    pub fn cleanup_cuda_bootstrap_low_latency(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
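
    // Editor's sketch (hypothetical call sequence over the declarations above,
    // inside `unsafe`, with all device pointers previously allocated):
    //     let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
    //     scratch_cuda_bootstrap_low_latency_64(stream, &mut pbs_buffer,
    //         glwe_dimension, polynomial_size, level_count, num_samples,
    //         max_shared_memory, true);
    //     cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(stream,
    //         lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
    //         lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
    //         lwe_dimension, glwe_dimension, polynomial_size, base_log, level,
    //         num_samples, num_lut_vectors, 0, max_shared_memory);
    //     cleanup_cuda_bootstrap_low_latency(stream, &mut pbs_buffer);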

    /// This scratch function allocates the necessary amount of data on the GPU for
    /// the multi-bit PBS on 64-bit inputs into `pbs_buffer`.
    pub fn scratch_cuda_multi_bit_pbs_64(
        v_stream: *const c_void,
        pbs_buffer: *mut *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        level_count: u32,
        grouping_factor: u32,
        input_lwe_ciphertext_count: u32,
        max_shared_memory: u32,
        allocate_gpu_memory: bool,
        lwe_chunk_size: u32,
    );

    /// Perform bootstrapping on a batch of input u64 LWE ciphertexts using the multi-bit algorithm.
    ///
    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
    /// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c =
    ///   (a0,..an-1,b) where n is the LWE dimension
    /// - `lut_vector`: should hold as many test vectors of size polynomial_size as there are
    ///   input ciphertexts, but actually holds `num_lut_vectors` vectors to reduce memory usage
    /// - `lut_vector_indexes`: stores the index corresponding to which test vector to use for
    ///   each sample in `lut_vector`
    /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n
    ///   mask values + 1 body value
    /// - `bootstrapping_key`: GGSW encryption of elements of the LWE secret key as in the
    ///   classical PBS, but this time we follow Zhou's trick and encrypt combinations of elements
    ///   of the key
    /// - `pbs_buffer`: a preallocated buffer to store temporary results
    /// - `lwe_dimension`: size of the Torus vector used to encrypt the input
    ///   LWE ciphertexts - referred to as n above (~ 600)
    /// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT GLWE
    ///   ciphertexts - referred to as k above. Only the value 1 is supported for this parameter.
    /// - `polynomial_size`: size of the test polynomial (test vector) and size of the
    ///   GLWE polynomial (~1024)
    /// - `grouping_factor`: number of elements of the LWE secret key combined per GGSW of the
    ///   bootstrap key
    /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
    /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
    /// - `num_samples`: number of encrypted input messages
    /// - `num_lut_vectors`: parameter to set the actual number of test vectors to be used
    /// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In
    ///   case of multi-GPU computing, it is assumed that only a part of the input LWE array is
    ///   copied to each GPU, but the whole LUT array is copied (because the case when the number
    ///   of LUTs is smaller than the number of input LWEs is not trivial to take into account in
    ///   the data repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider
    ///   for a given LWE input in the LUT array `lut_vector`.
    /// - `max_shared_memory`: maximum amount of shared memory to be used inside device functions
    pub fn cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
        v_stream: *const c_void,
        lwe_array_out: *mut c_void,
        lwe_output_indexes: *const c_void,
        lut_vector: *const c_void,
        lut_vector_indexes: *const c_void,
        lwe_array_in: *const c_void,
        lwe_input_indexes: *const c_void,
        bootstrapping_key: *const c_void,
        pbs_buffer: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        grouping_factor: u32,
        base_log: u32,
        level: u32,
        num_samples: u32,
        num_lut_vectors: u32,
        lwe_idx: u32,
        max_shared_memory: u32,
        lwe_chunk_size: u32,
    );

    /// This cleanup function frees the data for the multi-bit PBS on GPU
    /// contained in pbs_buffer for 64-bit inputs.
    pub fn cleanup_cuda_multi_bit_pbs(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
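
    // Editor's note (added commentary): the multi-bit PBS follows the same
    // scratch -> run -> cleanup lifecycle sketched above, with the extra
    // `grouping_factor` and `lwe_chunk_size` parameters threaded through
    // `scratch_cuda_multi_bit_pbs_64` and
    // `cuda_multi_bit_pbs_lwe_ciphertext_vector_64`.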
|
||||
|
||||
/// Perform keyswitch on a batch of 64-bit input LWE ciphertexts.
///
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out`: output batch of num_samples keyswitched ciphertexts c =
/// (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
/// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing lwe_dimension_in
/// mask values + 1 body value
/// - `ksk`: the keyswitch key to be used in the operation
/// - `base_log`: the log of the base used in the decomposition (should be the one used to
/// create the ksk).
/// - `level_count`: the number of levels used in the decomposition (should be the one used
/// to create the ksk).
/// - `num_samples`: the number of input and output LWE ciphertexts.
///
/// This function calls a wrapper to a device kernel that performs the keyswitch.
/// `num_samples` blocks of threads are launched.
pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
    v_stream: *const c_void,
    lwe_array_out: *mut c_void,
    lwe_output_indexes: *const c_void,
    lwe_array_in: *const c_void,
    lwe_input_indexes: *const c_void,
    keyswitch_key: *const c_void,
    input_lwe_dimension: u32,
    output_lwe_dimension: u32,
    base_log: u32,
    level_count: u32,
    num_samples: u32,
);

/// Perform the negation of a u64 input LWE ciphertext vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated on
/// the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in` is the LWE ciphertext vector used as input; it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `input_lwe_dimension` is the number of mask elements in the input and in the output
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE
/// ciphertext vector, as well as in the output.
///
/// Each element (mask element or body) of the input LWE ciphertext vector is negated.
/// The result is stored in the output LWE ciphertext vector. The input LWE ciphertext vector
/// is left unchanged. This function is a wrapper to a device function that performs the
/// operation on the GPU.
pub fn cuda_negate_lwe_ciphertext_vector_64(
    v_stream: *const c_void,
    lwe_array_out: *mut c_void,
    lwe_array_in: *const c_void,
    input_lwe_dimension: u32,
    input_lwe_ciphertext_count: u32,
);

pub fn cuda_negate_integer_radix_ciphertext_64_inplace(
    v_stream: *const c_void,
    lwe_array: *mut c_void,
    lwe_dimension: u32,
    lwe_ciphertext_count: u32,
    message_modulus: u32,
    carry_modulus: u32,
);

/// Perform the addition of two u64 input LWE ciphertext vectors.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated on
/// the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in_1` is the first LWE ciphertext vector used as input; it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `lwe_array_in_2` is the second LWE ciphertext vector used as input; it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `input_lwe_dimension` is the number of mask elements in the two input and in the output
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE
/// ciphertext vector, as well as in the output.
///
/// Each element (mask element or body) of the input LWE ciphertext vector 1 is added to the
/// corresponding element in the input LWE ciphertext vector 2. The result is stored in the
/// output LWE ciphertext vector. The two input LWE ciphertext vectors are left unchanged.
/// This function is a wrapper to a device function that performs the operation on the GPU.
pub fn cuda_add_lwe_ciphertext_vector_64(
    v_stream: *const c_void,
    lwe_array_out: *mut c_void,
    lwe_array_in_1: *const c_void,
    lwe_array_in_2: *const c_void,
    input_lwe_dimension: u32,
    input_lwe_ciphertext_count: u32,
);

/// Perform the addition of a u64 input LWE ciphertext vector with a u64 input plaintext vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated
/// on the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in` is the LWE ciphertext vector used as input; it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `plaintext_array_in` is the plaintext vector used as input; it should have been
/// allocated and initialized before calling this function. It should be of size
/// `input_lwe_ciphertext_count`.
/// - `input_lwe_dimension` is the number of mask elements in the input and output LWE
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE
/// ciphertext vector, as well as in the output. It is also the number of plaintexts in the
/// input plaintext vector.
///
/// Each plaintext of the input plaintext vector is added to the body of the corresponding LWE
/// ciphertext in the LWE ciphertext vector. The result of the operation is stored in the output
/// LWE ciphertext vector. The two input vectors are unchanged. This function is a
/// wrapper to a device function that performs the operation on the GPU.
pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    v_stream: *const c_void,
    lwe_array_out: *mut c_void,
    lwe_array_in: *const c_void,
    plaintext_array_in: *const c_void,
    input_lwe_dimension: u32,
    input_lwe_ciphertext_count: u32,
);

/// Perform the multiplication of a u64 input LWE ciphertext vector with a u64 input cleartext
/// vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated
/// on the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in` is the LWE ciphertext vector used as input; it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `cleartext_array_in` is the cleartext vector used as input; it should have been
/// allocated and initialized before calling this function. It should be of size
/// `input_lwe_ciphertext_count`.
/// - `input_lwe_dimension` is the number of mask elements in the input and output LWE
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE
/// ciphertext vector, as well as in the output. It is also the number of cleartexts in the
/// input cleartext vector.
///
/// Each cleartext of the input cleartext vector is multiplied with the mask and body of the
/// corresponding LWE ciphertext in the LWE ciphertext vector.
/// The result of the operation is stored in the output
/// LWE ciphertext vector. The two input vectors are unchanged. This function is a
/// wrapper to a device function that performs the operation on the GPU.
pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    v_stream: *const c_void,
    lwe_array_out: *mut c_void,
    lwe_array_in: *const c_void,
    cleartext_array_in: *const c_void,
    input_lwe_dimension: u32,
    input_lwe_ciphertext_count: u32,
);

pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    message_modulus: u32,
    carry_modulus: u32,
    glwe_dimension: u32,
    lwe_dimension: u32,
    polynomial_size: u32,
    pbs_base_log: u32,
    pbs_level: u32,
    ks_base_log: u32,
    ks_level: u32,
    grouping_factor: u32,
    num_blocks: u32,
    pbs_type: u32,
    max_shared_memory: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_integer_mult_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    radix_lwe_out: *mut c_void,
    radix_lwe_left: *const c_void,
    radix_lwe_right: *const c_void,
    bsk: *const c_void,
    ksk: *const c_void,
    mem_ptr: *mut i8,
    message_modulus: u32,
    carry_modulus: u32,
    glwe_dimension: u32,
    lwe_dimension: u32,
    polynomial_size: u32,
    pbs_base_log: u32,
    pbs_level: u32,
    ks_base_log: u32,
    ks_level: u32,
    grouping_factor: u32,
    num_blocks: u32,
    pbs_type: u32,
    max_shared_memory: u32,
);

pub fn cleanup_cuda_integer_mult(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    v_stream: *const c_void,
    lwe_array: *mut c_void,
    scalar_input: *const c_void,
    lwe_dimension: u32,
    lwe_ciphertext_count: u32,
    message_modulus: u32,
    carry_modulus: u32,
);

pub fn cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    v_stream: *const c_void,
    lwe_array: *mut c_void,
    scalar_input: u64,
    lwe_dimension: u32,
    lwe_ciphertext_count: u32,
);

pub fn scratch_cuda_integer_radix_bitop_kb_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    glwe_dimension: u32,
    polynomial_size: u32,
    big_lwe_dimension: u32,
    small_lwe_dimension: u32,
    ks_level: u32,
    ks_base_log: u32,
    pbs_level: u32,
    pbs_base_log: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    op_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    radix_lwe_out: *mut c_void,
    radix_lwe_left: *const c_void,
    radix_lwe_right: *const c_void,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cuda_bitnot_integer_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    radix_lwe_out: *mut c_void,
    radix_lwe_in: *const c_void,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    radix_lwe_output: *mut c_void,
    radix_lwe_input: *mut c_void,
    clear_blocks: *const c_void,
    num_clear_blocks: u32,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
    op_type: u32,
);

pub fn cleanup_cuda_integer_bitop(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn scratch_cuda_integer_radix_comparison_kb_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    glwe_dimension: u32,
    polynomial_size: u32,
    big_lwe_dimension: u32,
    small_lwe_dimension: u32,
    ks_level: u32,
    ks_base_log: u32,
    pbs_level: u32,
    pbs_base_log: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    op_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    radix_lwe_out: *mut c_void,
    radix_lwe_left: *const c_void,
    radix_lwe_right: *const c_void,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    radix_lwe_out: *mut c_void,
    radix_lwe_in: *const c_void,
    scalar_blocks: *const c_void,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
    num_scalar_blocks: u32,
);

pub fn cleanup_cuda_integer_comparison(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn scratch_cuda_full_propagation_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    lwe_dimension: u32,
    glwe_dimension: u32,
    polynomial_size: u32,
    pbs_level: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_full_propagation_64_inplace(
    v_stream: *const c_void,
    radix_lwe_right: *mut c_void,
    mem_ptr: *mut i8,
    ksk: *const c_void,
    bsk: *const c_void,
    lwe_dimension: u32,
    glwe_dimension: u32,
    polynomial_size: u32,
    ks_base_log: u32,
    ks_level: u32,
    pbs_base_log: u32,
    pbs_level: u32,
    grouping_factor: u32,
    num_blocks: u32,
);

pub fn cleanup_cuda_full_propagation(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn scratch_cuda_integer_radix_scalar_shift_kb_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    glwe_dimension: u32,
    polynomial_size: u32,
    big_lwe_dimension: u32,
    small_lwe_dimension: u32,
    ks_level: u32,
    ks_base_log: u32,
    pbs_level: u32,
    pbs_base_log: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    shift_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_integer_radix_scalar_shift_kb_64_inplace(
    v_stream: *const c_void,
    radix_lwe: *mut c_void,
    shift: u32,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cleanup_cuda_integer_radix_scalar_shift(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn scratch_cuda_integer_radix_cmux_kb_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    glwe_dimension: u32,
    polynomial_size: u32,
    big_lwe_dimension: u32,
    small_lwe_dimension: u32,
    ks_level: u32,
    ks_base_log: u32,
    pbs_level: u32,
    pbs_base_log: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
    v_stream: *const c_void,
    lwe_array_out: *mut c_void,
    lwe_condition: *const c_void,
    lwe_array_true: *const c_void,
    lwe_array_false: *const c_void,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cleanup_cuda_integer_radix_cmux(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    glwe_dimension: u32,
    polynomial_size: u32,
    big_lwe_dimension: u32,
    small_lwe_dimension: u32,
    ks_level: u32,
    ks_base_log: u32,
    pbs_level: u32,
    pbs_base_log: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    shift_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
    v_stream: *const c_void,
    radix_lwe: *mut c_void,
    n: u32,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cleanup_cuda_integer_radix_scalar_rotate(v_stream: *const c_void, mem_ptr: *mut *mut i8);

pub fn scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
    glwe_dimension: u32,
    polynomial_size: u32,
    big_lwe_dimension: u32,
    small_lwe_dimension: u32,
    ks_level: u32,
    ks_base_log: u32,
    pbs_level: u32,
    pbs_base_log: u32,
    grouping_factor: u32,
    num_blocks: u32,
    message_modulus: u32,
    carry_modulus: u32,
    pbs_type: u32,
    allocate_gpu_memory: bool,
);

pub fn cuda_propagate_single_carry_low_latency_kb_64_inplace(
    v_stream: *const c_void,
    radix_lwe: *mut c_void,
    mem_ptr: *mut i8,
    bsk: *const c_void,
    ksk: *const c_void,
    num_blocks: u32,
);

pub fn cleanup_cuda_propagate_single_carry_low_latency(
    v_stream: *const c_void,
    mem_ptr: *mut *mut i8,
);

}
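The scratch/run/cleanup triples declared above are meant to be called in that order around each integer operation. A minimal call-site sketch for the radix multiplication, assuming the stream, the device pointers (d_out, d_lhs, d_rhs, d_bsk, d_ksk) and all parameters are set up elsewhere (those names are illustrative, not part of this API):

    // Hypothetical lifecycle sketch: allocate the temporary buffer,
    // run the multiplication, then free the buffer on the same stream.
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    unsafe {
        scratch_cuda_integer_mult_radix_ciphertext_kb_64(
            v_stream, &mut mem_ptr, message_modulus, carry_modulus,
            glwe_dimension, lwe_dimension, polynomial_size,
            pbs_base_log, pbs_level, ks_base_log, ks_level,
            grouping_factor, num_blocks, pbs_type, max_shared_memory,
            true, // allocate_gpu_memory
        );
        cuda_integer_mult_radix_ciphertext_kb_64(
            v_stream, d_out, d_lhs, d_rhs, d_bsk, d_ksk, mem_ptr,
            message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
            polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
            grouping_factor, num_blocks, pbs_type, max_shared_memory,
        );
        cleanup_cuda_integer_mult(v_stream, &mut mem_ptr);
    }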

1
backends/tfhe-cuda-backend/rust_api/src/lib.rs
Normal file
@@ -0,0 +1 @@
pub mod cuda_bind;

@@ -1,4 +1,7 @@
{
    "m6i.metal": 7.168,
    "hpc7a.96xlarge": 7.7252
    "hpc7a.96xlarge": 7.7252,
    "p3.2xlarge": 3.06,
    "p4d.24xlarge": 32.7726,
    "p5.48xlarge": 98.32
}

53
ci/slab.toml
@@ -18,6 +18,31 @@ region = "eu-west-1"
image_id = "ami-0e88d98b86aff13de"
instance_type = "hpc7a.96xlarge"

[profile.gpu-test]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
instance_type = "p3.2xlarge"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
spawn_retry_duration = 60

[profile.gpu-bench]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
instance_type = "p4d.24xlarge"
# One spawn attempt every 30 seconds for 6 hours
spawn_retry_attempts = 720
spawn_retry_duration = 360
max_spot_hourly_price = "100.0"

[profile.gpu-bench-big]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
instance_type = "p5.48xlarge"
spawn_retry_attempts = 720
spawn_retry_duration = 360
max_spot_hourly_price = "150.0"

[command.cpu_test]
workflow = "aws_tfhe_tests.yml"
profile = "cpu-big"
@@ -43,21 +68,36 @@ workflow = "aws_tfhe_fast_tests.yml"
profile = "cpu-big"
check_run_name = "CPU AWS Fast Tests"

[command.integer_full_bench]
workflow = "integer_full_benchmark.yml"
profile = "bench"
check_run_name = "Integer CPU AWS Benchmarks Full Suite"
[command.gpu_test]
workflow = "aws_tfhe_gpu_tests.yml"
profile = "gpu-test"
check_run_name = "GPU AWS Tests"

[command.signed_integer_full_bench]
workflow = "signed_integer_full_benchmark.yml"
profile = "bench"
check_run_name = "Signed Integer CPU AWS Benchmarks Full Suite"

[command.integer_full_bench]
workflow = "integer_full_benchmark.yml"
profile = "bench"
check_run_name = "Integer CPU AWS Benchmarks Full Suite"

[command.integer_gpu_full_bench]
workflow = "integer_gpu_full_benchmark.yml"
profile = "gpu-test" # p3.2xlarge is the baseline for GPU benchmarks
check_run_name = "Integer GPU AWS Benchmarks Full Suite"

[command.integer_bench]
workflow = "integer_benchmark.yml"
profile = "bench"
check_run_name = "Integer CPU AWS Benchmarks"

[command.integer_gpu_bench]
workflow = "integer_gpu_benchmark.yml"
profile = "gpu-test"
check_run_name = "Integer GPU AWS Benchmarks"

[command.integer_multi_bit_bench]
workflow = "integer_multi_bit_benchmark.yml"
profile = "bench"
@@ -73,6 +113,11 @@ workflow = "signed_integer_multi_bit_benchmark.yml"
profile = "bench"
check_run_name = "Signed integer multi bit CPU AWS Benchmarks"

[command.integer_multi_bit_gpu_bench]
workflow = "integer_multi_bit_gpu_benchmark.yml"
profile = "gpu-bench"
check_run_name = "Integer multi bit GPU AWS Benchmarks"

[command.shortint_full_bench]
workflow = "shortint_full_benchmark.yml"
profile = "bench"

@@ -60,6 +60,7 @@ rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.3.0", features = ["serde", "fft128"] }
pulp = "0.13"
tfhe-cuda-backend = { path = "../backends/tfhe-cuda-backend/rust_api", optional = true }
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
paste = "1.0.7"
@@ -83,6 +84,7 @@ boolean = []
shortint = []
integer = ["shortint"]
internal-keycache = ["dep:lazy_static", "dep:fs2"]
gpu = ["tfhe-cuda-backend"]

# Experimental section
experimental = []
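Since `tfhe-cuda-backend` is an optional dependency pulled in only through the `gpu` feature, downstream code can gate its GPU paths on that feature. A minimal sketch (illustrative function, not part of this diff):

    // Compiled only when the crate is built with `--features gpu`,
    // which in turn activates the optional tfhe-cuda-backend dependency.
    #[cfg(feature = "gpu")]
    fn backend_name() -> &'static str {
        "tfhe-cuda-backend"
    }

    // Fallback used for builds without the GPU backend.
    #[cfg(not(feature = "gpu"))]
    fn backend_name() -> &'static str {
        "cpu"
    }
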
@@ -21,6 +21,7 @@ use tfhe::integer::U256;
use tfhe::shortint::parameters::{
    PARAM_MESSAGE_1_CARRY_1_KS_PBS, PARAM_MESSAGE_2_CARRY_2_KS_PBS, PARAM_MESSAGE_3_CARRY_3_KS_PBS,
    PARAM_MESSAGE_4_CARRY_4_KS_PBS, PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS,
    PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
};

/// The type used to hold scalar values
@@ -57,6 +58,11 @@ impl Default for ParamsAndNumBlocksIter {
        Err(_) => false,
    };

    let is_gpu = match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
        Ok(val) => val.contains("gpu"),
        Err(_) => false,
    };

    let bit_sizes = if is_fast_bench {
        FAST_BENCH_BIT_SIZES.to_vec()
    } else {
@@ -64,7 +70,18 @@ impl Default for ParamsAndNumBlocksIter {
    };

    if is_multi_bit {
        let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
        let params = if is_gpu {
            vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()]
        } else {
            vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()]
        };

        let bit_sizes = if is_fast_bench {
            vec![32]
        } else {
            BENCH_BIT_SIZES.to_vec()
        };

        let params_and_bit_sizes = iproduct!(params, bit_sizes);
        Self {
            params_and_bit_sizes,
@@ -77,6 +94,7 @@ impl Default for ParamsAndNumBlocksIter {
            // PARAM_MESSAGE_3_CARRY_3_KS_PBS.into(),
            // PARAM_MESSAGE_4_CARRY_4_KS_PBS.into(),
        ];

        let params_and_bit_sizes = iproduct!(params, bit_sizes);
        Self {
            params_and_bit_sizes,
@@ -1136,6 +1154,709 @@ define_server_key_bench_default_fn!(
    display_name: rotate_right
);

#[cfg(feature = "gpu")]
|
||||
mod cuda {
|
||||
use super::{default_scalar, shift_scalar, ParamsAndNumBlocksIter, ScalarType};
|
||||
use crate::utilities::{write_to_json, OperatorType};
|
||||
use criterion::{criterion_group, Criterion};
|
||||
use rand::prelude::*;
|
||||
use tfhe::core_crypto::gpu::{CudaDevice, CudaStream};
|
||||
use tfhe::integer::gpu::ciphertext::CudaRadixCiphertext;
|
||||
use tfhe::integer::gpu::server_key::CudaServerKey;
|
||||
use tfhe::integer::keycache::KEY_CACHE;
|
||||
use tfhe::integer::IntegerKeyKind;
|
||||
use tfhe::keycache::NamedParam;
|
||||
|
||||
    fn bench_cuda_server_key_unary_function_clean_inputs<F>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        unary_op: F,
    ) where
        F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, &CudaStream),
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(60));
        let mut rng = rand::thread_rng();

        let gpu_index = 0;
        let device = CudaDevice::new(gpu_index);
        let stream = CudaStream::new_unchecked(device);

        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
            let param_name = param.name();

            let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");

            bench_group.bench_function(&bench_id, |b| {
                let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                let gpu_sks = CudaServerKey::new(&cks, &stream);

                let encrypt_one_value = || {
                    let clearlow = rng.gen::<u128>();
                    let clearhigh = rng.gen::<u128>();
                    let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
                    let ct_0 = cks.encrypt_radix(clear_0, num_block);

                    CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream)
                };

                b.iter_batched(
                    encrypt_one_value,
                    |mut ct_0| {
                        unary_op(&gpu_sks, &mut ct_0, &stream);
                    },
                    criterion::BatchSize::SmallInput,
                )
            });

            write_to_json::<u64, _>(
                &bench_id,
                param,
                param.name(),
                display_name,
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus().0.ilog2(); num_block],
            );
        }

        bench_group.finish()
    }

    /// Base function to bench a server key function that is a binary operation; input
    /// ciphertexts will contain only zero carries
    fn bench_cuda_server_key_binary_function_clean_inputs<F>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        binary_op: F,
    ) where
        F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, &mut CudaRadixCiphertext, &CudaStream),
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(60));
        let mut rng = rand::thread_rng();

        let gpu_index = 0;
        let device = CudaDevice::new(gpu_index);
        let stream = CudaStream::new_unchecked(device);

        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
            let param_name = param.name();

            let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");

            bench_group.bench_function(&bench_id, |b| {
                let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                let gpu_sks = CudaServerKey::new(&cks, &stream);

                let encrypt_two_values = || {
                    let clearlow = rng.gen::<u128>();
                    let clearhigh = rng.gen::<u128>();
                    let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
                    let ct_0 = cks.encrypt_radix(clear_0, num_block);

                    let clearlow = rng.gen::<u128>();
                    let clearhigh = rng.gen::<u128>();
                    let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh));
                    let ct_1 = cks.encrypt_radix(clear_1, num_block);

                    let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
                    let d_ctxt_2 = CudaRadixCiphertext::from_radix_ciphertext(&ct_1, &stream);

                    (d_ctxt_1, d_ctxt_2)
                };

                b.iter_batched(
                    encrypt_two_values,
                    |(mut ct_0, mut ct_1)| {
                        binary_op(&gpu_sks, &mut ct_0, &mut ct_1, &stream);
                    },
                    criterion::BatchSize::SmallInput,
                )
            });

            write_to_json::<u64, _>(
                &bench_id,
                param,
                param.name(),
                display_name,
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus().0.ilog2(); num_block],
            );
        }

        bench_group.finish()
    }

    fn bench_cuda_server_key_binary_scalar_function_clean_inputs<F, G>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        binary_op: F,
        rng_func: G,
    ) where
        F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, ScalarType, &CudaStream),
        G: Fn(&mut ThreadRng, usize) -> ScalarType,
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(60));
        let mut rng = rand::thread_rng();

        let gpu_index = 0;
        let device = CudaDevice::new(gpu_index);
        let stream = CudaStream::new_unchecked(device);

        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
            if bit_size > ScalarType::BITS as usize {
                break;
            }

            let param_name = param.name();

            let max_value_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size);

            let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}");
            bench_group.bench_function(&bench_id, |b| {
                let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                let gpu_sks = CudaServerKey::new(&cks, &stream);

                let encrypt_one_value = || {
                    let clearlow = rng.gen::<u128>();
                    let clearhigh = rng.gen::<u128>();
                    let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
                    let ct_0 = cks.encrypt_radix(clear_0, num_block);

                    let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);

                    let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size;

                    (d_ctxt_1, clear_1)
                };

                b.iter_batched(
                    encrypt_one_value,
                    |(mut ct_0, clear_1)| {
                        binary_op(&gpu_sks, &mut ct_0, clear_1, &stream);
                    },
                    criterion::BatchSize::SmallInput,
                )
            });

            write_to_json::<u64, _>(
                &bench_id,
                param,
                param.name(),
                display_name,
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus().0.ilog2(); num_block],
            );
        }

        bench_group.finish()
    }

    fn cuda_default_if_then_else(c: &mut Criterion) {
        let mut bench_group = c.benchmark_group("integer::cuda::if_then_else");
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(60));
        let mut rng = rand::thread_rng();

        let gpu_index = 0;
        let device = CudaDevice::new(gpu_index);
        let stream = CudaStream::new_unchecked(device);

        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
            if bit_size > ScalarType::BITS as usize {
                break;
            }

            let param_name = param.name();

            let bench_id = format!("if_then_else::{param_name}::{bit_size}_bits_scalar_{bit_size}");
            bench_group.bench_function(&bench_id, |b| {
                let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                let gpu_sks = CudaServerKey::new(&cks, &stream);

                let encrypt_three_values = || {
                    let clear_cond = rng.gen::<bool>();
                    let ct_cond =
                        cks.encrypt_radix(tfhe::integer::U256::from(clear_cond), num_block);

                    let clearlow = rng.gen::<u128>();
                    let clearhigh = rng.gen::<u128>();
                    let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
                    let ct_then = cks.encrypt_radix(clear_0, num_block);

                    let clearlow = rng.gen::<u128>();
                    let clearhigh = rng.gen::<u128>();
                    let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh));
                    let ct_else = cks.encrypt_radix(clear_1, num_block);

                    let d_ct_cond = CudaRadixCiphertext::from_radix_ciphertext(&ct_cond, &stream);
                    let d_ct_then = CudaRadixCiphertext::from_radix_ciphertext(&ct_then, &stream);
                    let d_ct_else = CudaRadixCiphertext::from_radix_ciphertext(&ct_else, &stream);

                    (d_ct_cond, d_ct_then, d_ct_else)
                };

                b.iter_batched(
                    encrypt_three_values,
                    |(ct_cond, ct_then, ct_else)| {
                        let _ = gpu_sks.if_then_else(&ct_cond, &ct_then, &ct_else, &stream);
                    },
                    criterion::BatchSize::SmallInput,
                )
            });

            write_to_json::<u64, _>(
                &bench_id,
                param,
                param.name(),
                "if_then_else",
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus().0.ilog2(); num_block],
            );
        }

        bench_group.finish()
    }

    macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
        (method_name: $server_key_method:ident, display_name:$name:ident) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_unary_function_clean_inputs(
                        c,
                        concat!("integer::cuda::", stringify!($server_key_method)),
                        stringify!($name),
                        |server_key, lhs, stream| {
                            server_key.$server_key_method(lhs, stream);
                        }
                    )
                }
            }
        }
    );

    macro_rules! define_cuda_server_key_bench_clean_input_fn (
        (method_name: $server_key_method:ident, display_name:$name:ident) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_binary_function_clean_inputs(
                        c,
                        concat!("integer::cuda::", stringify!($server_key_method)),
                        stringify!($name),
                        |server_key, lhs, rhs, stream| {
                            server_key.$server_key_method(lhs, rhs, stream);
                        }
                    )
                }
            }
        }
    );

    macro_rules! define_cuda_server_key_bench_clean_input_scalar_fn (
        (method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_binary_scalar_function_clean_inputs(
                        c,
                        concat!("integer::cuda::", stringify!($server_key_method)),
                        stringify!($name),
                        |server_key, lhs, rhs, stream| {
                            server_key.$server_key_method(lhs, rhs, stream);
                        },
                        $($rng_fn)*
                    )
                }
            }
        }
    );
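    To make the `paste!`-based plumbing concrete, an invocation such as `define_cuda_server_key_bench_clean_input_fn!(method_name: unchecked_add, display_name: add)` expands, roughly, to the following (a sketch of the expansion, not literal compiler output):

        fn cuda_unchecked_add(c: &mut Criterion) {
            bench_cuda_server_key_binary_function_clean_inputs(
                c,
                "integer::cuda::unchecked_add",
                "add",
                |server_key, lhs, rhs, stream| {
                    server_key.unchecked_add(lhs, rhs, stream);
                },
            )
        }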
    //===========================================
    // Unchecked
    //===========================================
    define_cuda_server_key_bench_clean_input_unary_fn!(
        method_name: unchecked_neg,
        display_name: negation
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_bitand,
        display_name: bitand
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_bitor,
        display_name: bitor
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_bitxor,
        display_name: bitxor
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_mul,
        display_name: mul
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_add,
        display_name: add
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_sub,
        display_name: sub
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_eq,
        display_name: equal
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: unchecked_ne,
        display_name: not_equal
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_bitand,
        display_name: bitand,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_bitor,
        display_name: bitor,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_bitxor,
        display_name: bitxor,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_add,
        display_name: add,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_sub,
        display_name: sub,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_left_shift,
        display_name: left_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_right_shift,
        display_name: right_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_left_rotate,
        display_name: left_rotate,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_right_rotate,
        display_name: right_rotate,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_gt,
        display_name: greater_than,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_ge,
        display_name: greater_or_equal,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_lt,
        display_name: less_than,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_le,
        display_name: less_or_equal,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_max,
        display_name: max,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_scalar_min,
        display_name: min,
        rng_func: default_scalar
    );

    //===========================================
    // Default
    //===========================================

    define_cuda_server_key_bench_clean_input_unary_fn!(
        method_name: neg,
        display_name: negation
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: add,
        display_name: add
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: sub,
        display_name: sub
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: mul,
        display_name: mul
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: ne,
        display_name: not_equal
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: eq,
        display_name: equal
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: bitand,
        display_name: bitand
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: bitor,
        display_name: bitor
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: bitxor,
        display_name: bitxor
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: gt,
        display_name: greater_than
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: ge,
        display_name: greater_or_equal
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: lt,
        display_name: less_than
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: le,
        display_name: less_or_equal
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: max,
        display_name: max
    );

    define_cuda_server_key_bench_clean_input_fn!(
        method_name: min,
        display_name: min
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_sub,
        display_name: sub,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_add,
        display_name: add,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_left_shift,
        display_name: left_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_right_shift,
        display_name: right_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_bitand,
        display_name: bitand,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_bitor,
        display_name: bitor,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_bitxor,
        display_name: bitxor,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_gt,
        display_name: greater_than,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_ge,
        display_name: greater_or_equal,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_lt,
        display_name: less_than,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_le,
        display_name: less_or_equal,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_max,
        display_name: max,
        rng_func: default_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: scalar_min,
        display_name: min,
        rng_func: default_scalar
    );

    criterion_group!(
        unchecked_cuda_ops,
        cuda_unchecked_neg,
        cuda_unchecked_bitand,
        cuda_unchecked_bitor,
        cuda_unchecked_bitxor,
        cuda_unchecked_mul,
        cuda_unchecked_sub,
        cuda_unchecked_add,
        cuda_unchecked_eq,
        cuda_unchecked_ne,
    );

    criterion_group!(
        unchecked_scalar_cuda_ops,
        cuda_unchecked_scalar_bitand,
        cuda_unchecked_scalar_bitor,
        cuda_unchecked_scalar_bitxor,
        cuda_unchecked_scalar_add,
        cuda_unchecked_scalar_sub,
        cuda_unchecked_scalar_left_shift,
        cuda_unchecked_scalar_right_shift,
        cuda_unchecked_scalar_left_rotate,
        cuda_unchecked_scalar_right_rotate,
        cuda_unchecked_scalar_ge,
        cuda_unchecked_scalar_gt,
        cuda_unchecked_scalar_le,
        cuda_unchecked_scalar_lt,
        cuda_unchecked_scalar_max,
        cuda_unchecked_scalar_min,
    );

    criterion_group!(
        default_cuda_ops,
        cuda_neg,
        cuda_sub,
        cuda_add,
        cuda_mul,
        cuda_eq,
        cuda_ne,
        cuda_ge,
        cuda_gt,
        cuda_le,
        cuda_lt,
        cuda_max,
        cuda_min,
        cuda_bitand,
        cuda_bitor,
        cuda_bitxor,
        cuda_default_if_then_else,
    );

    criterion_group!(
        default_scalar_cuda_ops,
        cuda_scalar_sub,
        cuda_scalar_add,
        cuda_scalar_left_shift,
        cuda_scalar_right_shift,
        cuda_scalar_bitand,
        cuda_scalar_bitor,
        cuda_scalar_bitxor,
        cuda_scalar_ge,
        cuda_scalar_gt,
        cuda_scalar_le,
        cuda_scalar_lt,
        cuda_scalar_max,
        cuda_scalar_min,
    );
}

#[cfg(feature = "gpu")]
|
||||
use cuda::{
|
||||
default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops,
|
||||
};
|
||||
|
||||
criterion_group!(
    smart_ops,
    smart_neg,
@@ -1371,35 +2092,56 @@ criterion_group!(

criterion_group!(misc, full_propagate, full_propagate_parallelized);

#[cfg(feature = "gpu")]
|
||||
fn go_through_gpu_bench_groups(val: &str) {
|
||||
match val.to_lowercase().as_str() {
|
||||
"default" => {
|
||||
default_cuda_ops();
|
||||
default_scalar_cuda_ops()
|
||||
}
|
||||
"unchecked" => {
|
||||
unchecked_cuda_ops();
|
||||
unchecked_scalar_cuda_ops()
|
||||
}
|
||||
_ => panic!("unknown benchmark operations flavor"),
|
||||
};
|
||||
}
|
||||
|
||||
fn go_through_cpu_bench_groups(val: &str) {
|
||||
match val.to_lowercase().as_str() {
|
||||
"default" => {
|
||||
default_parallelized_ops();
|
||||
default_parallelized_ops_comp();
|
||||
default_scalar_parallelized_ops();
|
||||
default_scalar_parallelized_ops_comp()
|
||||
}
|
||||
"smart" => {
|
||||
smart_ops();
|
||||
smart_ops_comp();
|
||||
smart_scalar_ops();
|
||||
smart_parallelized_ops();
|
||||
smart_parallelized_ops_comp();
|
||||
smart_scalar_parallelized_ops();
|
||||
smart_scalar_parallelized_ops_comp()
|
||||
}
|
||||
"unchecked" => {
|
||||
unchecked_ops();
|
||||
unchecked_parallelized_ops();
|
||||
unchecked_ops_comp();
|
||||
unchecked_scalar_ops();
|
||||
unchecked_scalar_ops_comp()
|
||||
}
|
||||
"misc" => misc(),
|
||||
_ => panic!("unknown benchmark operations flavor"),
|
||||
};
|
||||
}
|
||||
fn main() {
|
||||
match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
|
||||
Ok(val) => {
|
||||
match val.to_lowercase().as_str() {
|
||||
"default" => {
|
||||
default_parallelized_ops();
|
||||
default_parallelized_ops_comp();
|
||||
default_scalar_parallelized_ops();
|
||||
default_scalar_parallelized_ops_comp()
|
||||
}
|
||||
"smart" => {
|
||||
smart_ops();
|
||||
smart_ops_comp();
|
||||
smart_scalar_ops();
|
||||
smart_parallelized_ops();
|
||||
smart_parallelized_ops_comp();
|
||||
smart_scalar_parallelized_ops();
|
||||
smart_scalar_parallelized_ops_comp()
|
||||
}
|
||||
"unchecked" => {
|
||||
unchecked_ops();
|
||||
unchecked_parallelized_ops();
|
||||
unchecked_ops_comp();
|
||||
unchecked_scalar_ops();
|
||||
unchecked_scalar_ops_comp()
|
||||
}
|
||||
"misc" => misc(),
|
||||
_ => panic!("unknown benchmark operations flavor"),
|
||||
};
|
||||
#[cfg(feature = "gpu")]
|
||||
go_through_gpu_bench_groups(&val);
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
go_through_cpu_bench_groups(&val);
|
||||
}
|
||||
Err(_) => {
|
||||
default_parallelized_ops();
|
||||
|
||||

@@ -1,6 +1,194 @@
use super::utils::*;
use std::os::raw::c_int;

#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_secret_key(
    output_lwe_sk_ptr: *mut u64,
    lwe_sk_dim: usize,
    seed_low_bytes: u64,
    seed_high_bytes: u64,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::commons::math::random::Seed;
        use crate::core_crypto::prelude::*;

        let seed_low_bytes: u128 = seed_low_bytes.into();
        let seed_high_bytes: u128 = seed_high_bytes.into();
        let seed = (seed_high_bytes << 64) | seed_low_bytes;

        let mut secret_generator =
            SecretRandomGenerator::<ActivatedRandomGenerator>::new(Seed(seed));

        // Create the LweSecretKey
        let output_lwe_sk_slice = std::slice::from_raw_parts_mut(output_lwe_sk_ptr, lwe_sk_dim);

        let mut lwe_sk = LweSecretKey::from_container(output_lwe_sk_slice);

        generate_binary_lwe_secret_key(&mut lwe_sk, &mut secret_generator);
    })
}

#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_encrypt(
    output_ct_ptr: *mut u64,
    pt: u64,
    lwe_sk_ptr: *const u64,
    lwe_sk_dim: usize,
    lwe_encryption_std_dev: f64,
    seed_low_bytes: u64,
    seed_high_bytes: u64,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::commons::generators::DeterministicSeeder;
        use crate::core_crypto::commons::math::random::Seed;
        use crate::core_crypto::prelude::*;

        let lwe_sk_slice = std::slice::from_raw_parts(lwe_sk_ptr, lwe_sk_dim);
        let lwe_sk = LweSecretKey::from_container(lwe_sk_slice);

        let seed_low_bytes: u128 = seed_low_bytes.into();
        let seed_high_bytes: u128 = seed_high_bytes.into();
        let seed = (seed_high_bytes << 64) | seed_low_bytes;

        let seed = Seed(seed);
        let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
        let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
            deterministic_seeder.seed(),
            &mut deterministic_seeder,
        );

        let plaintext = Plaintext(pt);
        let output_ct = std::slice::from_raw_parts_mut(output_ct_ptr, lwe_sk_dim + 1);
        let mut ct = LweCiphertext::from_container(output_ct, CiphertextModulus::new_native());

        let lwe_encryption_std_dev = StandardDev(lwe_encryption_std_dev);

        encrypt_lwe_ciphertext(
            &lwe_sk,
            &mut ct,
            plaintext,
            lwe_encryption_std_dev,
            &mut encryption_generator,
        );
    })
}

#[no_mangle]
pub unsafe extern "C" fn core_crypto_ggsw_encrypt(
    output_ct_ptr: *mut u64,
    pt: u64,
    glwe_sk_ptr: *const u64,
    glwe_sk_dim: usize,
    poly_size: usize,
    level_count: usize,
    base_log: usize,
    glwe_modular_variance: f64,
    seed_low_bytes: u64,
    seed_high_bytes: u64,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::commons::generators::DeterministicSeeder;
        use crate::core_crypto::commons::math::random::Seed;
        use crate::core_crypto::prelude::*;

        let glwe_sk_slice = std::slice::from_raw_parts(glwe_sk_ptr, glwe_sk_dim * poly_size);
        let glwe_sk = GlweSecretKey::from_container(glwe_sk_slice, PolynomialSize(poly_size));

        let seed_low_bytes: u128 = seed_low_bytes.into();
        let seed_high_bytes: u128 = seed_high_bytes.into();
        let seed = (seed_high_bytes << 64) | seed_low_bytes;

        let seed = Seed(seed);
        let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
        let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
            deterministic_seeder.seed(),
            &mut deterministic_seeder,
        );

        let plaintext = Plaintext(pt);
        let output_ct = std::slice::from_raw_parts_mut(
            output_ct_ptr,
            ggsw_ciphertext_size(
                GlweDimension(glwe_sk_dim).to_glwe_size(),
                PolynomialSize(poly_size),
                DecompositionLevelCount(level_count),
            ),
        );
        let mut ct = GgswCiphertext::from_container(
            output_ct,
            GlweDimension(glwe_sk_dim).to_glwe_size(),
            PolynomialSize(poly_size),
            DecompositionBaseLog(base_log),
            CiphertextModulus::new_native(),
        );

        let glwe_encryption_std_dev = StandardDev(glwe_modular_variance);

        encrypt_constant_ggsw_ciphertext(
            &glwe_sk,
            &mut ct,
            plaintext,
            glwe_encryption_std_dev,
            &mut encryption_generator,
        );
    })
}

#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_decrypt(
    output_pt: *mut u64,
    input_ct_ptr: *const u64,
    lwe_sk_ptr: *const u64,
    lwe_sk_dim: usize,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::prelude::*;

        let lwe_sk_slice = std::slice::from_raw_parts(lwe_sk_ptr, lwe_sk_dim);
        let lwe_sk = LweSecretKey::from_container(lwe_sk_slice);

        let input_ct = std::slice::from_raw_parts(input_ct_ptr, lwe_sk_dim + 1);
        let ct = LweCiphertext::from_container(input_ct, CiphertextModulus::new_native());

        let plaintext = decrypt_lwe_ciphertext(&lwe_sk, &ct);

        *output_pt = plaintext.0;
    })
}
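
Together, the key-generation, encryption and decryption entry points above form a C-callable round trip. A sketch of how a caller might exercise them from Rust, with illustrative buffer sizes, seed and noise parameter (none of these values come from this diff):

    // Illustrative round trip: deterministic key, encrypt, decrypt.
    let lwe_dim = 742usize;
    let mut sk = vec![0u64; lwe_dim];
    let mut ct = vec![0u64; lwe_dim + 1];
    let mut pt_out = 0u64;
    unsafe {
        core_crypto_lwe_secret_key(sk.as_mut_ptr(), lwe_dim, 42, 0);
        core_crypto_lwe_encrypt(
            ct.as_mut_ptr(),
            1u64 << 60, // plaintext pre-shifted into the high bits
            sk.as_ptr(),
            lwe_dim,
            0.000007069849454709433, // illustrative standard deviation
            42,
            0,
        );
        core_crypto_lwe_decrypt(&mut pt_out, ct.as_ptr(), sk.as_ptr(), lwe_dim);
    }
    // pt_out now holds the plaintext plus encryption noise in the low bits.
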
#[no_mangle]
pub unsafe extern "C" fn core_crypto_glwe_decrypt(
    output_pt: *mut u64,
    input_ct_ptr: *const u64,
    glwe_sk_ptr: *const u64,
    glwe_sk_dim: usize,
    glwe_poly_size: usize,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::prelude::*;

        let glwe_sk_slice = std::slice::from_raw_parts(glwe_sk_ptr, glwe_sk_dim * glwe_poly_size);
        let glwe_sk = GlweSecretKey::from_container(glwe_sk_slice, PolynomialSize(glwe_poly_size));

        let input_ct = std::slice::from_raw_parts(
            input_ct_ptr,
            glwe_ciphertext_size(
                GlweDimension(glwe_sk_dim).to_glwe_size(),
                PolynomialSize(glwe_poly_size),
            ),
        );
        let ct = GlweCiphertext::from_container(
            input_ct,
            PolynomialSize(glwe_poly_size),
            CiphertextModulus::new_native(),
        );
        let output = std::slice::from_raw_parts_mut(output_pt, glwe_poly_size);
        let mut plaintext_list = PlaintextList::from_container(output);

        decrypt_glwe_ciphertext(&glwe_sk, &ct, &mut plaintext_list);
    })
}

#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_multi_bit_bootstrapping_key_element_size(
    input_lwe_sk_dim: usize,
@@ -34,6 +222,88 @@ pub unsafe extern "C" fn core_crypto_lwe_multi_bit_bootstrapping_key_element_siz
    })
}

#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_bootstrapping_key(
    output_bsk_ptr: *mut u64,
    bsk_base_log: usize,
    bsk_level_count: usize,
    input_lwe_sk_ptr: *const u64,
    input_lwe_sk_dim: usize,
    output_glwe_sk_ptr: *const u64,
    output_glwe_sk_dim: usize,
    output_glwe_sk_poly_size: usize,
    glwe_encryption_std_dev: f64,
    seed_low_bytes: u64,
    seed_high_bytes: u64,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::commons::generators::DeterministicSeeder;
        use crate::core_crypto::commons::math::random::Seed;
        use crate::core_crypto::prelude::*;

        let input_lwe_sk_slice = std::slice::from_raw_parts(input_lwe_sk_ptr, input_lwe_sk_dim);
        let input_lwe_sk = LweSecretKey::from_container(input_lwe_sk_slice);

        let output_glwe_sk_dim = GlweDimension(output_glwe_sk_dim);
        let output_glwe_sk_poly_size = PolynomialSize(output_glwe_sk_poly_size);
        let output_glwe_sk_size =
            glwe_ciphertext_mask_size(output_glwe_sk_dim, output_glwe_sk_poly_size);
        let output_glwe_sk_slice =
            std::slice::from_raw_parts(output_glwe_sk_ptr, output_glwe_sk_size);
        let output_glwe_sk =
            GlweSecretKey::from_container(output_glwe_sk_slice, output_glwe_sk_poly_size);

        // The 128-bit seed crosses the FFI boundary as two u64 halves.
        let seed_low_bytes: u128 = seed_low_bytes.into();
        let seed_high_bytes: u128 = seed_high_bytes.into();
        let seed = (seed_high_bytes << 64) | seed_low_bytes;

        let mut deterministic_seeder =
            DeterministicSeeder::<ActivatedRandomGenerator>::new(Seed(seed));
        let mut encryption_random_generator =
            EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
                deterministic_seeder.seed(),
                &mut deterministic_seeder,
            );

        let lwe_base_log = DecompositionBaseLog(bsk_base_log);
        let lwe_level_count = DecompositionLevelCount(bsk_level_count);

        // Allocate a throwaway owned key only to learn the container length the
        // caller-provided buffer must have.
        let lwe_slice_len = {
            let bsk = LweBootstrapKeyOwned::new(
                0u64,
                output_glwe_sk.glwe_dimension().to_glwe_size(),
                output_glwe_sk.polynomial_size(),
                lwe_base_log,
                lwe_level_count,
                input_lwe_sk.lwe_dimension(),
                CiphertextModulus::new_native(),
            );
            bsk.into_container().len()
        };

        let bsk_slice = std::slice::from_raw_parts_mut(output_bsk_ptr, lwe_slice_len);

        let mut bsk = LweBootstrapKey::from_container(
            bsk_slice,
            output_glwe_sk.glwe_dimension().to_glwe_size(),
            output_glwe_sk.polynomial_size(),
            lwe_base_log,
            lwe_level_count,
            CiphertextModulus::new_native(),
        );

        let glwe_encryption_std_dev = StandardDev(glwe_encryption_std_dev);

        par_generate_lwe_bootstrap_key(
            &input_lwe_sk,
            &output_glwe_sk,
            &mut bsk,
            glwe_encryption_std_dev,
            &mut encryption_random_generator,
        )
    })
}

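The 128-bit seed crosses the C ABI as two u64 words, and the functions above reassemble it as (seed_high_bytes << 64) | seed_low_bytes. A small standalone check of that round trip (illustrative helper, not part of the diff):

fn split_seed(seed: u128) -> (u64, u64) {
    // Returns (low, high), matching the seed_low_bytes / seed_high_bytes parameters.
    (seed as u64, (seed >> 64) as u64)
}

let seed: u128 = 0x0123_4567_89ab_cdef_fedc_ba98_7654_3210;
let (low, high) = split_seed(seed);
assert_eq!(((high as u128) << 64) | (low as u128), seed);
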
#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
    input_lwe_sk_ptr: *const u64,
@@ -120,3 +390,151 @@ pub unsafe extern "C" fn core_crypto_par_generate_lwe_multi_bit_bootstrapping_ke
        );
    })
}

#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_keyswitch_key(
    output_ksk_ptr: *mut u64,
    ksk_base_log: usize,
    ksk_level_count: usize,
    input_lwe_sk_ptr: *const u64,
    input_lwe_sk_dim: usize,
    output_lwe_sk_ptr: *const u64,
    output_lwe_sk_dim: usize,
    lwe_encryption_std_dev: f64,
    seed_low_bytes: u64,
    seed_high_bytes: u64,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::commons::generators::DeterministicSeeder;
        use crate::core_crypto::commons::math::random::Seed;
        use crate::core_crypto::prelude::*;

        let input_lwe_sk_slice = std::slice::from_raw_parts(input_lwe_sk_ptr, input_lwe_sk_dim);
        let input_lwe_sk = LweSecretKey::from_container(input_lwe_sk_slice);
        let output_lwe_sk_slice = std::slice::from_raw_parts(output_lwe_sk_ptr, output_lwe_sk_dim);
        let output_lwe_sk = LweSecretKey::from_container(output_lwe_sk_slice);

        let seed_low_bytes: u128 = seed_low_bytes.into();
        let seed_high_bytes: u128 = seed_high_bytes.into();
        let seed = (seed_high_bytes << 64) | seed_low_bytes;

        let mut deterministic_seeder =
            DeterministicSeeder::<ActivatedRandomGenerator>::new(Seed(seed));
        let mut encryption_random_generator =
            EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
                deterministic_seeder.seed(),
                &mut deterministic_seeder,
            );

        let lwe_base_log = DecompositionBaseLog(ksk_base_log);
        let lwe_level_count = DecompositionLevelCount(ksk_level_count);

        // Same sizing trick as for the bootstrapping key: build a throwaway owned
        // key to learn the expected container length.
        let lwe_slice_len = {
            let ksk = LweKeyswitchKeyOwned::new(
                0u64,
                lwe_base_log,
                lwe_level_count,
                LweDimension(input_lwe_sk_dim),
                LweDimension(output_lwe_sk_dim),
                CiphertextModulus::new_native(),
            );
            ksk.into_container().len()
        };

        let ksk_slice = std::slice::from_raw_parts_mut(output_ksk_ptr, lwe_slice_len);

        let mut ksk = LweKeyswitchKey::from_container(
            ksk_slice,
            lwe_base_log,
            lwe_level_count,
            LweDimension(output_lwe_sk_dim).to_lwe_size(),
            CiphertextModulus::new_native(),
        );

        let lwe_encryption_std_dev = StandardDev(lwe_encryption_std_dev);

        generate_lwe_keyswitch_key(
            &input_lwe_sk,
            &output_lwe_sk,
            &mut ksk,
            lwe_encryption_std_dev,
            &mut encryption_random_generator,
        )
    })
}

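Callers have to size the output buffer before crossing the FFI boundary; a hedged host-side sketch of the same owned-key sizing trick the function uses internally (the helper name and the external tfhe:: paths are ours):

use tfhe::core_crypto::prelude::*;

// Hypothetical helper: number of u64 words expected behind output_ksk_ptr,
// measured by allocating a zeroed owned key and taking its container length.
fn ksk_u64_len(base_log: usize, level_count: usize, input_dim: usize, output_dim: usize) -> usize {
    LweKeyswitchKeyOwned::new(
        0u64,
        DecompositionBaseLog(base_log),
        DecompositionLevelCount(level_count),
        LweDimension(input_dim),
        LweDimension(output_dim),
        CiphertextModulus::new_native(),
    )
    .into_container()
    .len()
}
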
#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_private_functional_keyswitch_key(
    output_pksk_ptr: *mut u64,
    pksk_base_log: usize,
    pksk_level_count: usize,
    input_lwe_sk_ptr: *const u64,
    input_lwe_sk_dim: usize,
    output_glwe_sk_ptr: *const u64,
    poly_size: usize,
    glwe_dim: usize,
    lwe_encryption_std_dev: f64,
    seed_low_bytes: u64,
    seed_high_bytes: u64,
) -> c_int {
    catch_panic(|| {
        use crate::core_crypto::commons::generators::DeterministicSeeder;
        use crate::core_crypto::commons::math::random::Seed;
        use crate::core_crypto::prelude::*;

        let input_lwe_sk_slice = std::slice::from_raw_parts(input_lwe_sk_ptr, input_lwe_sk_dim);
        let input_lwe_sk = LweSecretKey::from_container(input_lwe_sk_slice);
        let output_glwe_sk_slice =
            std::slice::from_raw_parts(output_glwe_sk_ptr, glwe_dim * poly_size);
        let output_glwe_sk =
            GlweSecretKey::from_container(output_glwe_sk_slice, PolynomialSize(poly_size));

        let seed_low_bytes: u128 = seed_low_bytes.into();
        let seed_high_bytes: u128 = seed_high_bytes.into();
        let seed = (seed_high_bytes << 64) | seed_low_bytes;

        let mut deterministic_seeder =
            DeterministicSeeder::<ActivatedRandomGenerator>::new(Seed(seed));
        let mut encryption_random_generator =
            EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
                deterministic_seeder.seed(),
                &mut deterministic_seeder,
            );

        let pksk_len = {
            // One packing keyswitch key per GLWE mask polynomial plus one for the
            // body, hence glwe_dim + 1.
            let ksk = LwePrivateFunctionalPackingKeyswitchKeyList::new(
                0u64,
                DecompositionBaseLog(pksk_base_log),
                DecompositionLevelCount(pksk_level_count),
                LweDimension(input_lwe_sk_dim),
                GlweDimension(glwe_dim).to_glwe_size(),
                PolynomialSize(poly_size),
                FunctionalPackingKeyswitchKeyCount(glwe_dim + 1),
                CiphertextModulus::new_native(),
            );
            ksk.into_container().len()
        };

        let ksk_slice = std::slice::from_raw_parts_mut(output_pksk_ptr, pksk_len);

        let mut fp_ksk = LwePrivateFunctionalPackingKeyswitchKeyList::from_container(
            ksk_slice,
            DecompositionBaseLog(pksk_base_log),
            DecompositionLevelCount(pksk_level_count),
            LweDimension(input_lwe_sk_dim).to_lwe_size(),
            GlweDimension(glwe_dim).to_glwe_size(),
            PolynomialSize(poly_size),
            CiphertextModulus::new_native(),
        );

        let lwe_encryption_std_dev = StandardDev(lwe_encryption_std_dev);

        generate_circuit_bootstrap_lwe_pfpksk_list(
            &mut fp_ksk,
            &input_lwe_sk,
            &output_glwe_sk,
            lwe_encryption_std_dev,
            &mut encryption_random_generator,
        )
    })
}

@@ -332,7 +332,7 @@ pub fn round_decode<Scalar: UnsignedInteger>(decrypted: Scalar, delta: Scalar) -
 }
 
 // Here we will define a helper function to generate an accumulator for a PBS
-fn generate_accumulator<F, Scalar: UnsignedTorus + CastFrom<usize>>(
+pub(crate) fn generate_accumulator<F, Scalar: UnsignedTorus + CastFrom<usize>>(
     polynomial_size: PolynomialSize,
     glwe_size: GlweSize,
     message_modulus: usize,

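For context, the helper made pub(crate) above builds the PBS lookup table (the accumulator). A minimal plain-Rust sketch of the usual construction, a simplified stand-in rather than the diff's function: each message value m owns a box of polynomial_size / message_modulus coefficients set to f(m) * delta, and the table is rotated by half a box to absorb the negacyclic wrap-around.

fn fill_accumulator_body(lut: &mut [u64], message_modulus: usize, delta: u64, f: impl Fn(u64) -> u64) {
    let box_size = lut.len() / message_modulus;
    // One constant box per message value.
    for (m, chunk) in lut.chunks_exact_mut(box_size).enumerate() {
        chunk.fill(f(m as u64).wrapping_mul(delta));
    }
    // Negacyclic adjustment: negate the first half-box, then recenter the table.
    let half_box = box_size / 2;
    for coef in lut[..half_box].iter_mut() {
        *coef = coef.wrapping_neg();
    }
    lut.rotate_left(half_box);
}
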
@@ -23,6 +23,11 @@ pub struct CiphertextCount(pub usize);
 #[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
 pub struct LweCiphertextCount(pub usize);
 
+/// The index of a ciphertext in an lwe ciphertext list.
+#[cfg(feature = "gpu")]
+#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
+pub struct LweCiphertextIndex(pub usize);
+
 /// The number of ciphertexts in a glwe ciphertext list.
 #[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
 pub struct GlweCiphertextCount(pub usize);

73 tfhe/src/core_crypto/gpu/algorithms/lwe_keyswitch.rs Normal file
@@ -0,0 +1,73 @@
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::UnsignedInteger;

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_keyswitch_lwe_ciphertext_async<Scalar>(
    lwe_keyswitch_key: &CudaLweKeyswitchKey<Scalar>,
    input_lwe_ciphertext: &CudaLweCiphertextList<Scalar>,
    output_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    assert!(
        lwe_keyswitch_key.input_key_lwe_size().to_lwe_dimension()
            == input_lwe_ciphertext.lwe_dimension(),
        "Mismatched input LweDimension. \
        LweKeyswitchKey input LweDimension: {:?}, input LweCiphertext LweDimension {:?}.",
        lwe_keyswitch_key.input_key_lwe_size().to_lwe_dimension(),
        input_lwe_ciphertext.lwe_dimension(),
    );
    assert!(
        lwe_keyswitch_key.output_key_lwe_size().to_lwe_dimension()
            == output_lwe_ciphertext.lwe_dimension(),
        "Mismatched output LweDimension. \
        LweKeyswitchKey output LweDimension: {:?}, output LweCiphertext LweDimension {:?}.",
        lwe_keyswitch_key.output_key_lwe_size().to_lwe_dimension(),
        output_lwe_ciphertext.lwe_dimension(),
    );

    stream.keyswitch_async(
        &mut output_lwe_ciphertext.0.d_vec,
        output_indexes,
        &input_lwe_ciphertext.0.d_vec,
        input_indexes,
        lwe_keyswitch_key.input_key_lwe_size().to_lwe_dimension(),
        lwe_keyswitch_key.output_key_lwe_size().to_lwe_dimension(),
        &lwe_keyswitch_key.d_vec,
        lwe_keyswitch_key.decomposition_base_log(),
        lwe_keyswitch_key.decomposition_level_count(),
        input_lwe_ciphertext.lwe_ciphertext_count().0 as u32,
    );
}

pub fn cuda_keyswitch_lwe_ciphertext<Scalar>(
    lwe_keyswitch_key: &CudaLweKeyswitchKey<Scalar>,
    input_lwe_ciphertext: &CudaLweCiphertextList<Scalar>,
    output_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_keyswitch_lwe_ciphertext_async(
            lwe_keyswitch_key,
            input_lwe_ciphertext,
            output_lwe_ciphertext,
            input_indexes,
            output_indexes,
            stream,
        );
    }
    stream.synchronize();
}

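The index vectors in the signatures above pick which ciphertext of each list takes part in the keyswitch. A condensed, hedged usage sketch for a single ciphertext, reusing the constructors exercised by the tests further down (ksk_big_to_small, ct and ciphertext_modulus are assumed host-side values):

let device = CudaDevice::new(0);
let stream = CudaStream::new_unchecked(device);

// Move the keyswitch key and the input ciphertext to the GPU.
let d_ksk = CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
let d_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &stream);
let mut d_output = CudaLweCiphertextList::new(
    ksk_big_to_small.output_key_lwe_dimension(),
    LweCiphertextCount(1),
    ciphertext_modulus,
    &stream,
);

// Identity index vectors: ciphertext i of the input feeds slot i of the output.
let indexes = vec![0u64];
let mut d_in_idx = stream.malloc_async::<u64>(1);
let mut d_out_idx = stream.malloc_async::<u64>(1);
stream.copy_to_gpu_async(&mut d_in_idx, &indexes);
stream.copy_to_gpu_async(&mut d_out_idx, &indexes);

// The blocking wrapper synchronizes the stream before returning.
cuda_keyswitch_lwe_ciphertext(&d_ksk, &d_input, &mut d_output, &d_in_idx, &d_out_idx, &stream);
let switched = d_output.into_lwe_ciphertext(&stream);
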
361 tfhe/src/core_crypto/gpu/algorithms/lwe_linear_algebra.rs Normal file
@@ -0,0 +1,361 @@
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::UnsignedInteger;

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_add_async<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    lhs: &CudaLweCiphertextList<Scalar>,
    rhs: &CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    let num_samples = output.lwe_ciphertext_count().0 as u32;

    assert_eq!(
        lhs.lwe_ciphertext_count(),
        rhs.lwe_ciphertext_count(),
        "Mismatched number of ciphertexts between lhs ({:?}) and rhs ({:?})",
        lhs.lwe_ciphertext_count(),
        rhs.lwe_ciphertext_count()
    );

    assert_eq!(
        output.lwe_ciphertext_count(),
        rhs.lwe_ciphertext_count(),
        "Mismatched number of ciphertexts between output ({:?}) and rhs ({:?})",
        output.lwe_ciphertext_count(),
        rhs.lwe_ciphertext_count()
    );

    assert_eq!(
        lhs.ciphertext_modulus(),
        rhs.ciphertext_modulus(),
        "Mismatched moduli between lhs ({:?}) and rhs ({:?}) LweCiphertext",
        lhs.ciphertext_modulus(),
        rhs.ciphertext_modulus()
    );

    assert_eq!(
        output.ciphertext_modulus(),
        rhs.ciphertext_modulus(),
        "Mismatched moduli between output ({:?}) and rhs ({:?}) LweCiphertext",
        output.ciphertext_modulus(),
        rhs.ciphertext_modulus()
    );

    stream.add_lwe_ciphertext_vector_async(
        &mut output.0.d_vec,
        &lhs.0.d_vec,
        &rhs.0.d_vec,
        lhs.lwe_dimension(),
        num_samples,
    );
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_add_assign_async<Scalar>(
    lhs: &mut CudaLweCiphertextList<Scalar>,
    rhs: &CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    let num_samples = lhs.lwe_ciphertext_count().0 as u32;

    assert_eq!(
        lhs.lwe_ciphertext_count(),
        rhs.lwe_ciphertext_count(),
        "Mismatched number of ciphertexts between lhs ({:?}) and rhs ({:?})",
        lhs.lwe_ciphertext_count(),
        rhs.lwe_ciphertext_count()
    );

    assert_eq!(
        lhs.ciphertext_modulus(),
        rhs.ciphertext_modulus(),
        "Mismatched moduli between lhs ({:?}) and rhs ({:?}) LweCiphertext",
        lhs.ciphertext_modulus(),
        rhs.ciphertext_modulus()
    );

    stream.add_lwe_ciphertext_vector_assign_async(
        &mut lhs.0.d_vec,
        &rhs.0.d_vec,
        rhs.lwe_dimension(),
        num_samples,
    );
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_plaintext_add_async<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    lhs: &CudaLweCiphertextList<Scalar>,
    rhs: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    let num_samples = output.lwe_ciphertext_count().0 as u32;

    assert_eq!(
        output.lwe_ciphertext_count(),
        lhs.lwe_ciphertext_count(),
        "Mismatched number of ciphertexts between output ({:?}) and lhs ({:?})",
        output.lwe_ciphertext_count(),
        lhs.lwe_ciphertext_count()
    );

    assert_eq!(
        output.ciphertext_modulus(),
        lhs.ciphertext_modulus(),
        "Mismatched moduli between output ({:?}) and lhs ({:?}) LweCiphertext",
        output.ciphertext_modulus(),
        lhs.ciphertext_modulus()
    );

    stream.add_lwe_ciphertext_vector_plaintext_vector_async(
        &mut output.0.d_vec,
        &lhs.0.d_vec,
        rhs,
        lhs.lwe_dimension(),
        num_samples,
    );
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_plaintext_add_assign_async<Scalar>(
    lhs: &mut CudaLweCiphertextList<Scalar>,
    rhs: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    let num_samples = lhs.lwe_ciphertext_count().0 as u32;
    let lwe_dimension = lhs.lwe_dimension();

    stream.add_lwe_ciphertext_vector_plaintext_vector_assign_async(
        &mut lhs.0.d_vec,
        rhs,
        lwe_dimension,
        num_samples,
    );
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_negate_async<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    input: &CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    assert_eq!(
        input.lwe_ciphertext_count(),
        output.lwe_ciphertext_count(),
        "Mismatched number of ciphertexts between input ({:?}) and output ({:?})",
        input.lwe_ciphertext_count(),
        output.lwe_ciphertext_count()
    );
    let num_samples = output.lwe_ciphertext_count().0 as u32;
    let lwe_dimension = output.lwe_dimension();

    stream.negate_lwe_ciphertext_vector_async(
        &mut output.0.d_vec,
        &input.0.d_vec,
        lwe_dimension,
        num_samples,
    );
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_negate_assign_async<Scalar>(
    ct: &mut CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    let num_samples = ct.lwe_ciphertext_count().0 as u32;
    let lwe_dimension = ct.lwe_dimension();

    stream.negate_lwe_ciphertext_vector_assign_async(&mut ct.0.d_vec, lwe_dimension, num_samples);
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_cleartext_mul_async<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    input: &CudaLweCiphertextList<Scalar>,
    cleartext: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    assert_eq!(
        input.lwe_ciphertext_count(),
        output.lwe_ciphertext_count(),
        "Mismatched number of ciphertexts between input ({:?}) and output ({:?})",
        input.lwe_ciphertext_count(),
        output.lwe_ciphertext_count()
    );
    let num_samples = output.lwe_ciphertext_count().0 as u32;
    let lwe_dimension = output.lwe_dimension();

    stream.mult_lwe_ciphertext_vector_cleartext_vector(
        &mut output.0.d_vec,
        &input.0.d_vec,
        cleartext,
        lwe_dimension,
        num_samples,
    );
}

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_cleartext_mul_assign_async<Scalar>(
    ct: &mut CudaLweCiphertextList<Scalar>,
    cleartext: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    let num_samples = ct.lwe_ciphertext_count().0 as u32;
    let lwe_dimension = ct.lwe_dimension();

    stream.mult_lwe_ciphertext_vector_cleartext_vector_assign_async(
        &mut ct.0.d_vec,
        cleartext,
        lwe_dimension,
        num_samples,
    );
}

pub fn cuda_lwe_ciphertext_add<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    lhs: &CudaLweCiphertextList<Scalar>,
    rhs: &CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_add_async(output, lhs, rhs, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_add_assign<Scalar>(
    lhs: &mut CudaLweCiphertextList<Scalar>,
    rhs: &CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_add_assign_async(lhs, rhs, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_plaintext_add<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    lhs: &CudaLweCiphertextList<Scalar>,
    rhs: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_plaintext_add_async(output, lhs, rhs, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_plaintext_add_assign<Scalar>(
    lhs: &mut CudaLweCiphertextList<Scalar>,
    rhs: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_plaintext_add_assign_async(lhs, rhs, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_negate<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    input: &CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_negate_async(output, input, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_negate_assign<Scalar>(
    ct: &mut CudaLweCiphertextList<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_negate_assign_async(ct, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_cleartext_mul<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    input: &CudaLweCiphertextList<Scalar>,
    cleartext: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_cleartext_mul_async(output, input, cleartext, stream);
    }
    stream.synchronize();
}

pub fn cuda_lwe_ciphertext_cleartext_mul_assign<Scalar>(
    ct: &mut CudaLweCiphertextList<Scalar>,
    cleartext: &CudaVec<Scalar>,
    stream: &CudaStream,
) where
    Scalar: UnsignedInteger,
{
    unsafe {
        cuda_lwe_ciphertext_cleartext_mul_assign_async(ct, cleartext, stream);
    }
    stream.synchronize();
}

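An end-to-end sketch of the blocking add above, hedged: the host-side ciphertexts ct_a and ct_b, plus lwe_dimension and ciphertext_modulus, are assumed to come from the usual CPU-side setup shown in the tests below.

let device = CudaDevice::new(0);
let stream = CudaStream::new_unchecked(device);

let d_lhs = CudaLweCiphertextList::from_lwe_ciphertext(&ct_a, &stream);
let d_rhs = CudaLweCiphertextList::from_lwe_ciphertext(&ct_b, &stream);
let mut d_sum = CudaLweCiphertextList::new(
    lwe_dimension,
    LweCiphertextCount(1),
    ciphertext_modulus,
    &stream,
);

// Homomorphically, Enc(m_a) + Enc(m_b) decrypts to (m_a + m_b) mod the message modulus.
cuda_lwe_ciphertext_add(&mut d_sum, &d_lhs, &d_rhs, &stream);
let sum_ct = d_sum.into_lwe_ciphertext(&stream);
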
@@ -0,0 +1,123 @@
use crate::core_crypto::gpu::entities::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::entities::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::{CastInto, LweCiphertextIndex, UnsignedTorus};

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
    input: &CudaLweCiphertextList<Scalar>,
    output: &mut CudaLweCiphertextList<Scalar>,
    accumulator: &CudaGlweCiphertextList<Scalar>,
    lut_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
    stream: &CudaStream,
) where
    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus + CastInto<usize>,
{
    assert_eq!(
        input.lwe_dimension(),
        multi_bit_bsk.input_lwe_dimension(),
        "Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \
        FourierLweMultiBitBootstrapKey input LweDimension {:?}.",
        input.lwe_dimension(),
        multi_bit_bsk.input_lwe_dimension(),
    );

    assert_eq!(
        output.lwe_dimension(),
        multi_bit_bsk.output_lwe_dimension(),
        "Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \
        FourierLweMultiBitBootstrapKey output LweDimension {:?}.",
        output.lwe_dimension(),
        multi_bit_bsk.output_lwe_dimension(),
    );

    assert_eq!(
        accumulator.glwe_dimension(),
        multi_bit_bsk.glwe_dimension(),
        "Mismatched GlweDimension. Accumulator GlweDimension {:?}. \
        FourierLweMultiBitBootstrapKey GlweDimension {:?}.",
        accumulator.glwe_dimension(),
        multi_bit_bsk.glwe_dimension(),
    );

    assert_eq!(
        accumulator.polynomial_size(),
        multi_bit_bsk.polynomial_size(),
        "Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \
        FourierLweMultiBitBootstrapKey PolynomialSize {:?}.",
        accumulator.polynomial_size(),
        multi_bit_bsk.polynomial_size(),
    );

    assert_eq!(
        input.ciphertext_modulus(),
        output.ciphertext_modulus(),
        "Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
        input.ciphertext_modulus(),
        output.ciphertext_modulus(),
    );

    assert_eq!(
        input.ciphertext_modulus(),
        accumulator.ciphertext_modulus(),
        "Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
        input.ciphertext_modulus(),
        accumulator.ciphertext_modulus(),
    );

    stream.bootstrap_multi_bit_async(
        &mut output.0.d_vec,
        output_indexes,
        &accumulator.0.d_vec,
        lut_indexes,
        &input.0.d_vec,
        input_indexes,
        &multi_bit_bsk.d_vec,
        input.lwe_dimension(),
        multi_bit_bsk.glwe_dimension(),
        multi_bit_bsk.polynomial_size(),
        multi_bit_bsk.decomp_base_log(),
        multi_bit_bsk.decomp_level_count(),
        multi_bit_bsk.grouping_factor(),
        input.lwe_ciphertext_count().0 as u32,
        LweCiphertextIndex(0),
    );
}

#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
    input: &CudaLweCiphertextList<Scalar>,
    output: &mut CudaLweCiphertextList<Scalar>,
    accumulator: &CudaGlweCiphertextList<Scalar>,
    lut_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
    stream: &CudaStream,
) where
    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus + CastInto<usize>,
{
    unsafe {
        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async(
            input,
            output,
            accumulator,
            lut_indexes,
            output_indexes,
            input_indexes,
            multi_bit_bsk,
            stream,
        );
    }
    // Block until the GPU work completes, matching the other blocking wrappers.
    stream.synchronize();
}

@@ -0,0 +1,81 @@
use crate::core_crypto::gpu::entities::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::entities::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::{
    CastInto, LweCiphertextCount, LweCiphertextIndex, UnsignedTorus,
};

/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until the stream is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
    input: &CudaLweCiphertextList<Scalar>,
    output: &mut CudaLweCiphertextList<Scalar>,
    accumulator: &CudaGlweCiphertextList<Scalar>,
    lut_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    num_samples: LweCiphertextCount,
    bsk: &CudaLweBootstrapKey,
    stream: &CudaStream,
) where
    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus + CastInto<usize>,
{
    assert_eq!(input.ciphertext_modulus(), output.ciphertext_modulus());
    assert_eq!(
        output.ciphertext_modulus(),
        accumulator.ciphertext_modulus()
    );

    stream.bootstrap_low_latency_async(
        &mut output.0.d_vec,
        output_indexes,
        &accumulator.0.d_vec,
        lut_indexes,
        &input.0.d_vec,
        input_indexes,
        &bsk.d_vec,
        input.lwe_dimension(),
        bsk.glwe_dimension(),
        bsk.polynomial_size(),
        bsk.decomp_base_log(),
        bsk.decomp_level_count(),
        num_samples.0 as u32,
        LweCiphertextIndex(0),
    );
}

#[allow(clippy::too_many_arguments)]
pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
    input: &CudaLweCiphertextList<Scalar>,
    output: &mut CudaLweCiphertextList<Scalar>,
    accumulator: &CudaGlweCiphertextList<Scalar>,
    lut_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    num_samples: LweCiphertextCount,
    bsk: &CudaLweBootstrapKey,
    stream: &CudaStream,
) where
    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus + CastInto<usize>,
{
    unsafe {
        cuda_programmable_bootstrap_lwe_ciphertext_async(
            input,
            output,
            accumulator,
            lut_indexes,
            output_indexes,
            input_indexes,
            num_samples,
            bsk,
            stream,
        );
    }
    // Block until the GPU work completes, matching the other blocking wrappers.
    stream.synchronize();
}

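As with keyswitching, the PBS entry points are driven by index vectors plus an accumulator list. A hedged single-ciphertext call sketch; d_input, d_output, d_accumulator and d_bsk are assumed to have been built with the corresponding CUDA constructors, and the identity indexes mirror the test setup shown later.

let indexes = vec![0u64];
let mut d_lut_idx = stream.malloc_async::<u64>(1);
let mut d_in_idx = stream.malloc_async::<u64>(1);
let mut d_out_idx = stream.malloc_async::<u64>(1);
stream.copy_to_gpu_async(&mut d_lut_idx, &indexes);
stream.copy_to_gpu_async(&mut d_in_idx, &indexes);
stream.copy_to_gpu_async(&mut d_out_idx, &indexes);

// One sample, LUT 0, slot 0 in and out; the blocking wrapper synchronizes.
cuda_programmable_bootstrap_lwe_ciphertext(
    &d_input,
    &mut d_output,
    &d_accumulator,
    &d_lut_idx,
    &d_out_idx,
    &d_in_idx,
    LweCiphertextCount(1),
    &d_bsk,
    &stream,
);
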
12 tfhe/src/core_crypto/gpu/algorithms/mod.rs Normal file
@@ -0,0 +1,12 @@
pub mod lwe_linear_algebra;
pub mod lwe_multi_bit_programmable_bootstrapping;
pub mod lwe_programmable_bootstrapping;

mod lwe_keyswitch;
#[cfg(test)]
mod test;

pub use lwe_keyswitch::*;
pub use lwe_linear_algebra::*;
pub use lwe_multi_bit_programmable_bootstrapping::*;
pub use lwe_programmable_bootstrapping::*;

123 tfhe/src/core_crypto/gpu/algorithms/test/lwe_keyswitch.rs Normal file
@@ -0,0 +1,123 @@
use super::*;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use crate::core_crypto::gpu::{cuda_keyswitch_lwe_ciphertext, CudaDevice, CudaStream};
use itertools::Itertools;

fn lwe_encrypt_ks_decrypt_custom_mod<Scalar: UnsignedTorus + CastFrom<usize>>(
    params: ClassicTestParams<Scalar>,
) {
    let lwe_dimension = params.lwe_dimension;
    let lwe_modular_std_dev = params.lwe_modular_std_dev;
    let ciphertext_modulus = params.ciphertext_modulus;
    let message_modulus_log = params.message_modulus_log;
    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
    let glwe_dimension = params.glwe_dimension;
    let polynomial_size = params.polynomial_size;
    let ks_decomp_base_log = params.ks_base_log;
    let ks_decomp_level_count = params.ks_level;

    let gpu_index = 0;
    let device = CudaDevice::new(gpu_index);
    let stream = CudaStream::new_unchecked(device);

    let mut rsc = TestResources::new();

    const NB_TESTS: usize = 10;
    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
    let mut msg = msg_modulus;
    let delta: Scalar = encoding_with_padding / msg_modulus;

    while msg != Scalar::ZERO {
        msg = msg.wrapping_sub(Scalar::ONE);
        for _ in 0..NB_TESTS {
            let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
                lwe_dimension,
                &mut rsc.secret_random_generator,
            );

            let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
                glwe_dimension,
                polynomial_size,
                &mut rsc.secret_random_generator,
            );

            let big_lwe_sk = glwe_sk.into_lwe_secret_key();

            let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
                &big_lwe_sk,
                &lwe_sk,
                ks_decomp_base_log,
                ks_decomp_level_count,
                lwe_modular_std_dev,
                ciphertext_modulus,
                &mut rsc.encryption_random_generator,
            );

            assert!(check_encrypted_content_respects_mod(
                &ksk_big_to_small,
                ciphertext_modulus
            ));

            let d_ksk_big_to_small =
                CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);

            let plaintext = Plaintext(msg * delta);

            let ct = allocate_and_encrypt_new_lwe_ciphertext(
                &big_lwe_sk,
                plaintext,
                lwe_modular_std_dev,
                ciphertext_modulus,
                &mut rsc.encryption_random_generator,
            );

            assert!(check_encrypted_content_respects_mod(
                &ct,
                ciphertext_modulus
            ));

            let d_ct = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &stream);
            let mut d_output_ct = CudaLweCiphertextList::new(
                ksk_big_to_small.output_key_lwe_dimension(),
                LweCiphertextCount(1),
                ciphertext_modulus,
                &stream,
            );
            let num_blocks = d_ct.0.lwe_ciphertext_count.0;
            let lwe_indexes_usize = (0..num_blocks).collect_vec();
            let lwe_indexes = lwe_indexes_usize
                .iter()
                .map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
                .collect_vec();
            let mut d_input_indexes = stream.malloc_async::<Scalar>(num_blocks as u32);
            let mut d_output_indexes = stream.malloc_async::<Scalar>(num_blocks as u32);
            stream.copy_to_gpu_async(&mut d_input_indexes, &lwe_indexes);
            stream.copy_to_gpu_async(&mut d_output_indexes, &lwe_indexes);

            cuda_keyswitch_lwe_ciphertext(
                &d_ksk_big_to_small,
                &d_ct,
                &mut d_output_ct,
                &d_input_indexes,
                &d_output_indexes,
                &stream,
            );

            let output_ct = d_output_ct.into_lwe_ciphertext(&stream);

            assert!(check_encrypted_content_respects_mod(
                &output_ct,
                ciphertext_modulus
            ));

            let decrypted = decrypt_lwe_ciphertext(&lwe_sk, &output_ct);

            let decoded = round_decode(decrypted.0, delta) % msg_modulus;

            assert_eq!(msg, decoded);
        }
    }
}

create_gpu_parametrized_test!(lwe_encrypt_ks_decrypt_custom_mod);

@@ -0,0 +1,78 @@
use super::*;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::{cuda_lwe_ciphertext_add_assign, CudaDevice, CudaStream};

fn lwe_encrypt_add_assign_decrypt_custom_mod<Scalar: UnsignedTorus>(
    params: ClassicTestParams<Scalar>,
) {
    let lwe_dimension = params.lwe_dimension;
    let lwe_modular_std_dev = params.lwe_modular_std_dev;
    let ciphertext_modulus = params.ciphertext_modulus;
    let message_modulus_log = params.message_modulus_log;
    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);

    let gpu_index = 0;
    let device = CudaDevice::new(gpu_index);
    let stream = CudaStream::new_unchecked(device);

    let mut rsc = TestResources::new();

    const NB_TESTS: usize = 10;
    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
    let mut msg = msg_modulus;
    let delta: Scalar = encoding_with_padding / msg_modulus;

    while msg != Scalar::ZERO {
        msg = msg.wrapping_sub(Scalar::ONE);
        for _ in 0..NB_TESTS {
            let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
                lwe_dimension,
                &mut rsc.secret_random_generator,
            );

            let mut ct = LweCiphertext::new(
                Scalar::ZERO,
                lwe_dimension.to_lwe_size(),
                ciphertext_modulus,
            );

            let plaintext = Plaintext(msg * delta);

            encrypt_lwe_ciphertext(
                &lwe_sk,
                &mut ct,
                plaintext,
                lwe_modular_std_dev,
                &mut rsc.encryption_random_generator,
            );

            assert!(check_encrypted_content_respects_mod(
                &ct,
                ciphertext_modulus
            ));

            let rhs = ct.clone();

            // Convert to CUDA objects
            let mut d_ct = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &stream);
            let d_rhs = CudaLweCiphertextList::from_lwe_ciphertext(&rhs, &stream);

            cuda_lwe_ciphertext_add_assign(&mut d_ct, &d_rhs, &stream);

            let output = d_ct.into_lwe_ciphertext(&stream);

            assert!(check_encrypted_content_respects_mod(
                &output,
                ciphertext_modulus
            ));

            let decrypted = decrypt_lwe_ciphertext(&lwe_sk, &output);

            let decoded = round_decode(decrypted.0, delta) % msg_modulus;

            assert_eq!((msg + msg) % msg_modulus, decoded);
        }
    }
}

create_gpu_parametrized_test!(lwe_encrypt_add_assign_decrypt_custom_mod);

Some files were not shown because too many files have changed in this diff.