feat(gpu): add tfhe-cuda-backend to the repository

This commit is contained in:
Pedro Alves
2023-08-15 14:11:43 -03:00
committed by Agnès Leroy
parent f0e6b4c395
commit c632ac1b9a
137 changed files with 36138 additions and 55 deletions

View File

@@ -1,14 +1,101 @@
# Compile and test Concrete-cuda on an AWS instance
name: Concrete Cuda - Full tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
jobs:
placeholder:
name: Placeholder
runs-on: ubuntu-latest
run-cuda-tests-linux:
concurrency:
group: tfhe_cuda_backend_test-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
name: Test code in EC2
runs-on: ${{ inputs.runner_name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- run: |
echo "Hello this is a Placeholder for GPU Workflow"
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: stable
default: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run all tests
run: |
make clippy_gpu
make test_gpu

View File

@@ -0,0 +1,157 @@
# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
run-integer-benchmarks:
name: Execute integer benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
- name: Parse benchmarks to csv
run: |
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,154 @@
# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU full benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [ integer, integer_multi_bit]
op_flavor: [ default, unchecked ]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
with:
fetch-depth: 0
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notification:
name: Slack Notification
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
needs: integer-benchmarks
steps:
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,158 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer Multi-bit benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
run-integer-benchmarks:
name: Execute integer multi-bit benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "11.8"
cuda_arch: "70"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run multi-bit benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
- name: Parse benchmarks to csv
run: |
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -49,6 +49,7 @@ jobs:
command: [ boolean_bench, shortint_bench,
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
integer_gpu_bench, integer_multi_bit_gpu_bench,
pbs_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:

View File

@@ -24,8 +24,8 @@ jobs:
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench, integer_full_bench,
signed_integer_full_bench, pbs_bench, wasm_client_bench ]
command: [ boolean_bench, shortint_full_bench, integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
pbs_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs

View File

@@ -29,6 +29,7 @@ jobs:
allow-repeats: true
message: |
@slab-ci cpu_fast_test
@slab-ci gpu_test
- name: Add approved label
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf

View File

@@ -53,6 +53,10 @@ endif
REGEX_STRING?=''
REGEX_PATTERN?=''
# tfhe-cuda-backend
TFHECUDA_SRC="backends/tfhe-cuda-backend/implementation"
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# Exclude these files from coverage reports
define COVERAGE_EXCLUDED_FILES
--exclude-files apps/trivium/src/trivium/* \
@@ -137,10 +141,21 @@ check_linelint_installed:
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
.PHONY: fmt_gpu # Format rust and cuda code
fmt_gpu: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
cd backends/tfhe-cuda-backend/implementation/ && ./format_tfhe_cuda_backend.sh
.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
.PHONY: clippy_gpu # Run clippy lints on the gpu backend
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-p tfhe -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
linelint -a .
@@ -333,6 +348,23 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
-p $(TFHE_SPEC) -- core_crypto::; \
fi
.PHONY: test_gpu # Run the tests of the core_crypto and integer modules on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -498,7 +530,7 @@ docs: doc
lint_doc: install_rs_check_toolchain
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --no-deps
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
lint_docs: lint_doc
@@ -577,6 +609,20 @@ bench_integer: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer # Run benchmarks for signed integer
bench_signed_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
bench_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -585,13 +631,6 @@ bench_integer_multi_bit: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer # Run benchmarks for signed integer
bench_signed_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
bench_signed_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -600,6 +639,14 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
--bench integer-signed-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
bench_integer_multi_bit_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
.PHONY: bench_shortint # Run benchmarks for shortint
bench_shortint: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
@@ -715,9 +762,12 @@ sha256_bool: install_rs_check_toolchain
--example sha256_bool \
--features=$(TARGET_ARCH_FEATURE),boolean
.PHONY: pcc # pcc stands for pre commit checks
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: pcc clippy_gpu
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests

View File

@@ -0,0 +1,10 @@
# -----------------------------
# Options affecting formatting.
# -----------------------------
with section("format"):
# How wide to allow formatted cmake files
line_width = 120
# How many spaces to tab for indent
tab_size = 2

View File

@@ -0,0 +1,89 @@
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(tfhe_cuda_backend LANGUAGES CXX CUDA)
# Check that a CUDA compiler with at least the minimum supported version is available; otherwise abort the configuration.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif()
# If CUDA is not available, or the minimum version is too low do not build
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "Cuda compiler not found.")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
endif()
# Get CUDA compute capability
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
execute_process(
COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE ARCH)
file(REMOVE ${OUTPUTFILE})
if(${CUDA_RETURN_CODE} EQUAL 0)
set(CUDA_SUCCESS "TRUE")
else()
set(CUDA_SUCCESS "FALSE")
endif()
if(${CUDA_SUCCESS})
message(STATUS "CUDA Architecture: ${ARCH}")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
else()
message(WARNING ${ARCH})
endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Add OpenMP support
find_package(OpenMP REQUIRED)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_ARCHITECTURES native)
if(NOT CUDA_NVCC_FLAGS)
set(CUDA_NVCC_FLAGS -arch=sm_70)
endif()
# When targeting a specific GPU, use -arch=sm_70; add --ptxas-options=-v to see register spills and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")
set(INCLUDE_DIR include)
add_subdirectory(src)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)
install(TARGETS tfhe_cuda_backend DESTINATION lib)
# Define a function to add a lint target.
find_file(CPPLINT NAMES cpplint cpplint.exe)
if(CPPLINT)
# Add a custom target to lint all child projects. Dependencies are specified in child projects.
add_custom_target(all_lint)
# Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()
enable_testing()

View File

@@ -0,0 +1,3 @@
set noparent
linelength=240
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17

View File

@@ -0,0 +1,52 @@
# TFHE Cuda backend
## Introduction
The `tfhe-cuda-backend` holds the code for GPU acceleration of Zama's variant of TFHE.
It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.
It also provides utility functions to allocate memory on the GPU, to copy data back
and forth between the CPU and the GPU, and to create and destroy Cuda streams (a short usage sketch is given after the list of operations below):
- `cuda_create_stream`, `cuda_destroy_stream`
- `cuda_malloc`, `cuda_check_valid_malloc`
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
- `cuda_get_number_of_gpus`
- `cuda_synchronize_device`
The cryptographic operations it provides are:
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
- a low latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low_latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
- the larger precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
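As a quick illustration, the sketch below (a hypothetical host-side example, assuming the program is compiled with the `include` directory on the include path and linked against the `tfhe_cuda_backend` library) round-trips a buffer through the GPU using only the utility entry points listed above:
```
#include "device.h"
#include <cstdint>
#include <vector>

int main() {
  uint32_t gpu_index = 0;
  cuda_stream_t *stream = cuda_create_stream(gpu_index);

  // Host data: 1024 arbitrary 64-bit values.
  std::vector<uint64_t> h_in(1024, 42), h_out(1024, 0);
  uint64_t size = h_in.size() * sizeof(uint64_t);

  // Check that the allocation fits on the device, then allocate and round-trip.
  cuda_check_valid_malloc(size, gpu_index);
  void *d_buf = cuda_malloc(size, gpu_index);
  cuda_memcpy_async_to_gpu(d_buf, h_in.data(), size, stream);
  cuda_memcpy_async_to_cpu(h_out.data(), d_buf, size, stream);
  cuda_synchronize_stream(stream);

  // Release device memory and the stream.
  cuda_drop(d_buf, gpu_index);
  cuda_destroy_stream(stream);
  return 0;
}
```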
## Dependencies
**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
- [cmake](https://cmake.org/) >= 3.24
## Build
The Cuda project held in `tfhe-cuda-backend` can be compiled independently of the rest of `tfhe-rs` in the
following way:
```
git clone git@github.com:zama-ai/tfhe-rs
cd tfhe-rs/backends/tfhe-cuda-backend/implementation
mkdir build
cd build
cmake ..
make
```
The compute capability is detected automatically (from the first GPU found) and set accordingly.
## Links
- [TFHE](https://eprint.iacr.org/2018/421.pdf)
## License
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.ai`.

View File

@@ -0,0 +1,22 @@
#include <stdio.h>
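// Built and run by CMakeLists.txt at configure time: prints "-arch=sm_XY" for the
// first GPU found, or a diagnostic message and a non-zero exit code if no device
// with a sufficient compute capability is available.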
int main(int argc, char **argv) {
cudaDeviceProp dP;
float min_cc = 3.0;
int rc = cudaGetDeviceProperties(&dP, 0);
if (rc != cudaSuccess) {
cudaError_t error = cudaGetLastError();
printf("CUDA error: %s", cudaGetErrorString(error));
return rc; /* Failure */
}
if ((dP.major + (dP.minor / 10.0)) < min_cc) {
printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
"Building CUDA Code",
min_cc, dP.major, dP.minor);
return 1; /* Failure */
} else {
printf("-arch=sm_%d%d", dP.major, dP.minor);
return 0; /* Success */
}
}

View File

@@ -0,0 +1,6 @@
#!/bin/bash
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'

View File

@@ -0,0 +1,118 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include "device.h"
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_bootstrap_amortized_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void scratch_cuda_bootstrap_amortized_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
void scratch_cuda_bootstrap_low_latency_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_low_latency_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif
#endif // CUDA_BOOTSTRAP_H

View File

@@ -0,0 +1,45 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include "device.h"
#include <cstdint>
extern "C" {
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
}
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples);
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension);
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
#endif
#endif // CUDA_MULTI_BIT_H

View File

@@ -0,0 +1,18 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H
#include "device.h"
#include <cstdint>
extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension);
};
#endif

View File

@@ -0,0 +1,88 @@
#ifndef DEVICE_H
#define DEVICE_H
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#define synchronize_threads_in_block() __syncthreads()
extern "C" {
struct cuda_stream_t {
cudaStream_t stream;
uint32_t gpu_index;
cuda_stream_t(uint32_t gpu_index) {
this->gpu_index = gpu_index;
cudaStreamCreate(&stream);
}
void release() {
cudaSetDevice(gpu_index);
cudaStreamDestroy(stream);
}
void synchronize() { cudaStreamSynchronize(stream); }
};
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(cuda_stream_t *stream);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
int cuda_check_support_cooperative_groups();
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);
int cuda_synchronize_stream(cuda_stream_t *stream);
#define check_cuda_error(ans) \
{ cuda_error((ans), __FILE__, __LINE__); }
inline void cuda_error(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort)
exit(code);
}
}
}
template <typename Torus>
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n);
#endif

View File

@@ -0,0 +1,100 @@
#include "cuComplex.h"
#include "thrust/complex.h"
#include <iostream>
#include <string>
#include <type_traits>
#define PRINT_VARS
#ifdef PRINT_VARS
#define PRINT_DEBUG_5(var, begin, end, step, cond) \
_print_debug(var, #var, begin, end, step, cond, "", false)
#define PRINT_DEBUG_6(var, begin, end, step, cond, text) \
_print_debug(var, #var, begin, end, step, cond, text, true)
#define CAT(A, B) A##B
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
#define PRINT_DEBUG(...) \
PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif
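// Illustrative usage from device code (hypothetical variable name):
// PRINT_DEBUG(lwe_array, 0, 8, 1, threadIdx.x == 0 && blockIdx.x == 0, "after ks")
// prints the first 8 elements of lwe_array exactly once, preceded by the text line.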
template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %u\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %d\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
var[i].imag());
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
}
}
__syncthreads();
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,21 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_
#include "device.h"
#include <cstdint>
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
}
#endif // CNCRT_KS_H_

View File

@@ -0,0 +1,50 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_
#include "bootstrap.h"
#include <cstdint>
#include <device.h>
extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
}
#endif // CUDA_LINALG_H_

View File

@@ -0,0 +1,22 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(
tfhe_cuda_backend
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -0,0 +1 @@
#include "ciphertext.cuh"

View File

@@ -0,0 +1,44 @@
#ifndef CUDA_CIPHERTEXT_CUH
#define CUDA_CIPHERTEXT_CUH
#include "ciphertext.h"
#include "device.h"
#include <cstdint>
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(stream->gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_gpu(dest, src, size, stream);
}
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(stream->gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_cpu(dest, src, size, stream);
}
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}
#endif

View File

@@ -0,0 +1,162 @@
#ifndef CNCRT_CRYPTO_CUH
#define CNCRT_CRYPTO_CUH
#include "device.h"
#include <cstdint>
/**
* GadgetMatrix implements the iterator design pattern to decompose a set of
* num_poly consecutive polynomials with degree params::degree. A total of
* level_count levels is expected and each call to decompose_and_compress_next()
* writes the next level to the result. It is also possible to advance an
* arbitrary number of levels by using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
* By default, it works on a single polynomial.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
uint32_t num_poly;
T offset;
int current_level;
T mask_mod_b;
T *state;
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly = 1)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
int tid = threadIdx.x;
for (int i = 0; i < num_poly * params::opt; i++) {
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
// Decomposes all polynomials at once
__device__ void decompose_and_compress_next(double2 *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2;
decompose_and_compress_next_polynomial(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
if (j == 0)
current_level -= 1;
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
for (int i = 0; i < params::opt / 2; i++) {
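// Extract the next base-2^base_log digit of the real and imaginary halves, then
// recenter it into a signed digit by propagating a carry into the remaining state.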
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[tid].x = (int32_t)res_re;
result[tid].y = (int32_t)res_im;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
// Decomposes a single polynomial, storing each thread's digits in a thread-local result array
__device__ void
decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
if (j == 0)
current_level -= 1;
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[i].x = (int32_t)res_re;
result[i].y = (int32_t)res_im;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
};
template <typename T> class GadgetMatrixSingle {
private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
T offset;
public:
__device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
: base_log(base_log), level_count(level_count) {
uint32_t bg = 1 << base_log;
this->halfbg = bg / 2;
this->mask = bg - 1;
T temp = 0;
for (int i = 0; i < this->level_count; i++) {
temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
}
this->offset = temp * this->halfbg;
}
__device__ T decompose_one_level_single(T element, uint32_t level) {
T s = element + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
return (T)(temp1 - this->halfbg);
}
};
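// Peels one base-2^base_log digit off `state` and recenters it into a signed digit,
// propagating the carry back into `state` (same balanced decomposition as
// GadgetMatrix::decompose_and_compress_next_polynomial).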
template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
Torus res = state & mask_mod_b;
state >>= base_log;
Torus carry = ((res - 1ll) | state) & res;
carry >>= base_log - 1;
state += carry;
res -= carry << base_log;
return res;
}
#endif // CNCRT_CRYPTO_CUH

View File

@@ -0,0 +1,74 @@
#ifndef CNCRT_GGSW_CUH
#define CNCRT_GGSW_CUH
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "polynomial/parameters.cuh"
template <typename T, typename ST, class params, sharedMemDegree SMD>
__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
int8_t *device_mem) {
extern __shared__ int8_t sharedmem[];
double2 *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = (double2 *)sharedmem;
else
selected_memory = (double2 *)&device_mem[blockIdx.x * params::degree];
// Compression
int offset = blockIdx.x * blockDim.x;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
ST x = src[(tid) + params::opt * offset];
ST y = src[(tid + params::degree / 2) + params::opt * offset];
selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(selected_memory);
synchronize_threads_in_block();
// Write the output to global memory
tid = threadIdx.x;
#pragma unroll
for (int j = 0; j < params::opt / 2; j++) {
dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
tid += params::degree / params::opt;
}
}
/**
* Applies the FFT transform on sequence of GGSW ciphertexts already in the
* global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t gpu_index, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
int blockSize = polynomial_size / params::opt;
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
d_mem);
}
check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_GGSW_CUH

View File

@@ -0,0 +1,48 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include <cstdint>
/* Perform keyswitch on a batch of 32-bit input LWE ciphertexts.
* See the equivalent operation on 64 bits for more details.
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform keyswitch on a batch of 64-bit input LWE ciphertexts.
*
* - `stream` is the Cuda stream (which also carries the index of the GPU) to be
* used in the kernel launch
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - ksk: the keyswitch key to be used in the operation
* - base log: the log of the base used in the decomposition (should be the one
* used to create the ksk)
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}

View File

@@ -0,0 +1,144 @@
#ifndef CNCRT_KS_CUH
#define CNCRT_KS_CUH
#include "device.h"
#include "gadget.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include <thread>
#include <vector>
template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
uint32_t lwe_dimension_out,
uint32_t level_count) {
int pos = i * level_count * (lwe_dimension_out + 1) +
level * (lwe_dimension_out + 1);
Torus *ptr = &ksk[pos];
return ptr;
}
/*
* keyswitch kernel
* Each thread handles a piece of the following equation:
* $$GLWE_{s2}(\Delta.m+e) = (0,0,..,0,b) - \sum_{i=0,k-1} <Dec(a_i),
* (GLWE_{s2}(s1_i q/\beta),..,GLWE_{s2}(s1_i q/\beta^l))>$$
* where k is the dimension of the GLWE ciphertext. If the polynomial
* dimension in GLWE is > 1, this equation is solved for each polynomial
* coefficient. Dec denotes the decomposition with base beta and l levels,
* and the inner product is done between the decomposition of a_i and the l
* GLWE encryptions of s1_i q/\beta^j, with j in [1,l]. We obtain a GLWE
* encryption of \Delta.m (with \Delta the scaling factor) under key s2
* instead of s1, with an increased noise.
*
*/
template <typename Torus>
__global__ void
keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
int lwe_lower, int lwe_upper, int cutoff) {
int tid = threadIdx.x;
extern __shared__ int8_t sharedmem[];
Torus *local_lwe_array_out = (Torus *)sharedmem;
auto block_lwe_array_in = get_chunk(
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
auto block_lwe_array_out = get_chunk(
lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
int lwe_part_per_thd;
if (tid < cutoff) {
lwe_part_per_thd = lwe_upper;
} else {
lwe_part_per_thd = lwe_lower;
}
__syncthreads();
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] = 0;
}
__syncthreads();
if (tid == 0) {
local_lwe_array_out[lwe_dimension_out] =
block_lwe_array_in[lwe_dimension_in];
}
for (int i = 0; i < lwe_dimension_in; i++) {
__syncthreads();
Torus a_i =
round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mask_mod_b = (1ll << base_log) - 1ll;
for (int j = 0; j < level_count; j++) {
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
}
}
}
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
block_lwe_array_out[idx] = local_lwe_array_out[idx];
}
}
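
The inner loop above peels one base_log-bit digit off `state` per decomposition level via decompose_one. The sketch below is a simplified, host-side illustration of that digit extraction with arbitrary example parameters; it uses plain unsigned digits, whereas the actual decompose_one in gadget.cuh may apply a signed (balanced) correction that is omitted here.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t base_log = 4, level_count = 3; // arbitrary example parameters
  // a_i is already rounded to the closest multiple of 2^(64 - base_log * level_count)
  uint64_t a_i = 0xABC0000000000000ull;
  uint64_t state = a_i >> (64 - base_log * level_count); // 0xABC
  uint64_t mask_mod_b = (1ull << base_log) - 1ull;
  for (uint32_t j = 0; j < level_count; j++) {
    uint64_t digit = state & mask_mod_b; // 0xC, then 0xB, then 0xA
    state >>= base_log;
    printf("level %u digit %#llx\n", j, (unsigned long long)digit);
  }
  return 0;
}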
/// assumes lwe_array_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cudaSetDevice(stream->gpu_index);
constexpr int ideal_threads = 128;
int lwe_dim = lwe_dimension_out + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
check_cuda_error(cudaGetLastError());
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
// cudaFuncSetAttribute(keyswitch<Torus>,
// cudaFuncAttributeMaxDynamicSharedMemorySize,
// shared_mem);
keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
lwe_upper, cutoff);
check_cuda_error(cudaGetLastError());
}
#endif
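
As a sanity check of the work-splitting arithmetic in the host function above: `cutoff` threads handle `lwe_upper` coefficients each and the remaining threads handle `lwe_lower`, which together cover all lwe_dimension_out + 1 coefficients. The sketch below reproduces that computation on the host with an arbitrary example dimension, not a recommended parameter set.

#include <cassert>
#include <cmath>

int main() {
  const int ideal_threads = 128;
  int lwe_dim = 513; // lwe_dimension_out + 1, arbitrary example
  int lwe_lower, lwe_upper, cutoff;
  if (lwe_dim % ideal_threads == 0) {
    lwe_lower = lwe_upper = lwe_dim / ideal_threads;
    cutoff = 0;
  } else {
    int y =
        (int)std::ceil((double)lwe_dim / ideal_threads) * ideal_threads - lwe_dim;
    cutoff = ideal_threads - y;
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = (int)std::ceil((double)lwe_dim / ideal_threads);
  }
  // 1 thread handles 5 coefficients, 127 threads handle 4: 1*5 + 127*4 == 513
  assert(cutoff * lwe_upper + (ideal_threads - cutoff) * lwe_lower == lwe_dim);
  return 0;
}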

View File

@@ -0,0 +1,74 @@
#ifndef CNCRT_TORUS_CUH
#define CNCRT_TORUS_CUH
#include "types/int128.cuh"
#include <limits>
template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
r = T(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
uint32_t &r) {
r = __double2uint_rn(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
uint64_t &r) {
// The ull intrinsic does not behave in the same way on all architectures and
// on some platforms this causes the cmux tree test to fail.
// Hence the intrinsic is not used here.
uint128 nnnn = make_uint128_from_float(x);
uint64_t lll = nnnn.lo_;
r = lll;
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
return res;
}
template <typename T>
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
uint32_t log_shift) {
output =
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
uint32_t log_shift) {
output =
round(__uint2double_rn(element) /
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
uint32_t log_shift) {
output = round(__ull2double_rn(element) /
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
#endif // CNCRT_TORUS_CUH
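
A worked host-side example of round_to_closest_multiple: with base_log = 4 and level_count = 3 (illustration values only), just the 12 most significant bits of a 64-bit torus element are kept, i.e. the value is rounded to the nearest multiple of 2^52.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t base_log = 4, level_count = 3;
  const uint64_t shift = 64 - base_log * level_count; // 52
  // pick a value exactly halfway between 5 * 2^52 and 6 * 2^52
  uint64_t x = (5ull << shift) + (1ull << (shift - 1));
  uint64_t mask = 1ull << (shift - 1);
  uint64_t b = (x & mask) >> (shift - 1); // rounding bit, here 1
  uint64_t res = ((x >> shift) + b) << shift;
  assert(res == (6ull << shift)); // rounded up to the next multiple of 2^52
  return 0;
}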

View File

@@ -0,0 +1,350 @@
#include "device.h"
#include <cstdint>
#include <cuda_runtime.h>
/// Unsafe function to create a CUDA stream, must first check that the GPU exists
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cuda_stream_t *stream = new cuda_stream_t(gpu_index);
return stream;
}
/// Unsafe function to destroy a CUDA stream, must first check that the GPU exists
int cuda_destroy_stream(cuda_stream_t *stream) {
stream->release();
return 0;
}
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
void *ptr;
cudaMalloc((void **)&ptr, size);
check_cuda_error(cudaGetLastError());
return ptr;
}
/// Allocates a size-byte array in device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
cudaSetDevice(stream->gpu_index);
void *ptr;
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
cudaDevAttrMemoryPoolsSupported,
stream->gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
} else {
check_cuda_error(cudaMalloc((void **)&ptr, size));
}
#else
check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
return ptr;
}
/// Checks that allocation is valid
/// 0: valid
/// -1: invalid, not enough memory in device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
if (size > free_mem) {
// error code: not enough memory
return -1;
}
return 0;
}
/// Returns
/// -> 0 if Cooperative Groups is not supported.
/// -> 1 otherwise
int cuda_check_support_cooperative_groups() {
int cooperative_groups_supported = 0;
cudaDeviceGetAttribute(&cooperative_groups_supported,
cudaDevAttrCooperativeLaunch, 0);
return cooperative_groups_supported > 0;
}
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
return 0;
}
/// Tries to copy memory to the GPU synchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
if (size == 0) {
// error code: zero copy size
return -3;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
return 0;
}
/// Tries to copy memory to the CPU synchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
if (size == 0) {
// error code: zero copy size
return -3;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
if (attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
return 0;
}
/// Tries to copy memory within a GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr_dest;
cudaPointerGetAttributes(&attr_dest, dest);
if (attr_dest.device != stream->gpu_index &&
attr_dest.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaPointerAttributes attr_src;
cudaPointerGetAttributes(&attr_src, src);
if (attr_src.device != stream->gpu_index &&
attr_src.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
if (attr_src.device != attr_dest.device) {
// error code: different devices
return -1;
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
stream->stream));
return 0;
}
/// Synchronizes device
/// 0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceSynchronize();
return 0;
}
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
return 0;
}
template <typename Torus>
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n)
array[index] = value;
}
template <typename Torus>
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n) {
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
n);
}
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
uint64_t value, uint64_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
uint32_t value, uint32_t n);
/// Tries to copy memory to the CPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
return 0;
}
/// Return number of GPUs available
int cuda_get_number_of_gpus() {
int num_gpus;
cudaGetDeviceCount(&num_gpus);
return num_gpus;
}
/// Drop a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
check_cuda_error(cudaFree(ptr));
return 0;
}
/// Drop a cuda array. Tries to do it asynchronously
int cuda_drop_async(void *ptr, cuda_stream_t *stream) {
cudaSetDevice(stream->gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
cudaDevAttrMemoryPoolsSupported,
stream->gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaFreeAsync(ptr, stream->stream));
} else {
check_cuda_error(cudaFree(ptr));
}
#else
check_cuda_error(cudaFree(ptr));
#endif
return 0;
}
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu_index);
int max_shared_memory = 0;
if (prop.major >= 6) {
max_shared_memory = prop.sharedMemPerMultiprocessor;
} else {
max_shared_memory = prop.sharedMemPerBlock;
}
return max_shared_memory;
}
int cuda_synchronize_stream(cuda_stream_t *stream) {
stream->synchronize();
return 0;
}
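
The comments above state that a safe wrapper around the unchecked cuda_malloc() must call cuda_check_valid_malloc() first. A minimal sketch of such a wrapper, reusing only the functions defined in this file (and assuming they are declared in device.h, as the include at the top suggests); the nullptr-on-error behaviour is illustrative, not part of the backend:

#include "device.h"
#include <cstdint>

// Returns nullptr instead of allocating when the request cannot be satisfied.
void *cuda_malloc_checked(uint64_t size, uint32_t gpu_index) {
  int rc = cuda_check_valid_malloc(size, gpu_index);
  if (rc == -2) // gpu index doesn't exist
    return nullptr;
  if (rc == -1) // not enough free memory on the device
    return nullptr;
  return cuda_malloc(size, gpu_index);
}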

View File

@@ -0,0 +1,725 @@
#ifndef GPU_BOOTSTRAP_FFT_CUH
#define GPU_BOOTSTRAP_FFT_CUH
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
#include "types/complex/operations.cuh"
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
* N/2 sized complex with the even coefficients in the real part
* and the odd coefficients in the imaginary part. This is referred to
* as the half-size FFT
* - when calling NSMFFT_direct for the forward negacyclic FFT of PBS,
* opt is divided by 2 because the butterfly pattern is always applied
* between pairs of coefficients
* - instead of twisting each coefficient A_j before the FFT by
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
* the FFT is modified, and for each level k of the FFT the twiddle:
* w_j,k = exp(-i pi j/2^k)
* is replaced with:
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
*/
template <class params> __device__ void NSMFFT_direct(double2 *A) {
/* We don't perform a bit reversal here, since the twiddles are already
* reversed. Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides.
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
// level 1
// we don't do an actual complex multiplication on level 1 since we have only
// one twiddle; its real and imaginary parts are equal, so we can multiply
// with simpler operations
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
i1 = tid;
i2 = tid + params::degree / 2;
u = A[i1];
v = A[i2] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
// from this level on there is more than one twiddle and none of them has
// equal real and imaginary parts, so a complete complex multiplication is
// needed. For each level, params::degree / 2^level is the number of
// coefficients inside each chunk of that level
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// from level 8 on, we need to check the size of params::degree: the minimum
// supported actual polynomial size is 256, which is halved to a compressed
// size of 128, so the first 7 butterfly levels are always needed. Since the
// butterfly levels are hardcoded, we check whether the polynomial size is
// big enough to require each additional level.
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
// compressed size = 8192 corresponds to an actual polynomial size of 16384.
// From this size on, the twiddles can't fit in constant memory,
// so the butterfly operation accesses device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
}
/*
* negacyclic inverse fft
*/
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
/* We don't perform a bit reversal here, since the twiddles are already
* reversed. Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides.
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, w;
// divide input by compressed polynomial size
tid = threadIdx.x;
for (size_t i = 0; i < params::opt; ++i) {
A[tid] /= params::degree;
tid += params::degree / params::opt;
}
__syncthreads();
// none of the twiddles have equal real and imaginary parts, so a complete
// complex multiplication has to be done and there is more than one twiddle
// per level. The mapping in the backward fft is reversed: the butterfly
// operation starts from the last level.
// compressed size = 8192 corresponds to an actual polynomial size of 16384;
// twiddles for this size can't fit in constant memory, so the butterfly
// operation for this level accesses device memory to fetch them.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
// below level 8, we don't need to check the size of params::degree: the
// minimum supported actual polynomial size is 256, which is halved to a
// compressed size of 128, so the last 7 butterfly levels are always needed.
// Since the butterfly levels are hardcoded, no size check is required for
// these levels.
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 1
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2);
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
i2 = i1 + params::degree / 2;
w = negtwiddles[twid_id + 1];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
/*
* global batch fft
* performs the fft in half size
* unrolling the half-size fft results in half size + 1 elements
* this function must be called with the actual degree
* it takes an already compressed input
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
/*
* global batch polynomial multiplication
* only used for fft tests
* d_input1 and d_output must not have the same pointer
* d_input1 can be modified inside the function
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
double2 *d_output, double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
// Move the first polynomial into shared memory (if possible, otherwise it
// goes into the device buffer)
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform direct negacyclic fourier transform
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// Put the result of direct fft inside input1
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
// Move the second polynomial into shared memory (if possible, otherwise it
// goes into the device buffer)
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform direct negacyclic fourier transform on the second polynomial
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// calculate pointwise multiplication inside fft buffer
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform backward negacyclic fourier transform
__syncthreads();
NSMFFT_inverse<HalfDegree<params>>(fft);
__syncthreads();
// copy results in output buffer
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
#endif // GPU_BOOTSTRAP_FFT_CUH
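
batch_polynomial_mul above multiplies polynomials modulo X^N + 1 by transforming both inputs, multiplying pointwise, and transforming back. A naive O(N^2) host-side reference of that negacyclic product, usable as a test oracle (illustration only, not part of the backend):

#include <cstddef>
#include <vector>

std::vector<double> negacyclic_mul(const std::vector<double> &a,
                                   const std::vector<double> &b) {
  std::size_t N = a.size();
  std::vector<double> c(N, 0.0);
  for (std::size_t i = 0; i < N; i++) {
    for (std::size_t j = 0; j < N; j++) {
      std::size_t k = i + j;
      if (k < N)
        c[k] += a[i] * b[j];
      else
        c[k - N] -= a[i] * b[j]; // X^N = -1 wraps around with a sign flip
    }
  }
  return c;
}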

File diff suppressed because it is too large

View File

@@ -0,0 +1,13 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
/*
* 'negtwiddles' are stored in constant memory for faster access times.
* Because of its limited size, only twiddles for polynomial sizes up to 2^12
* can be stored there; twiddles for 2^13 are stored in the device memory
* array 'negtwiddles13'.
*/
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif

View File

@@ -0,0 +1,51 @@
#include "integer/bitwise_ops.cuh"
void scratch_cuda_integer_radix_bitop_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_radix_bitop_kb<uint64_t>(
stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
params, op_type, allocate_gpu_memory);
}
void cuda_bitop_integer_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count) {
host_integer_radix_bitop_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cuda_bitnot_integer_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
host_integer_radix_bitnot_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -0,0 +1,51 @@
#ifndef CUDA_INTEGER_BITWISE_OPS_CUH
#define CUDA_INTEGER_BITWISE_OPS_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <omp.h>
template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_1, Torus *lwe_array_2,
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
num_radix_blocks, lut);
}
template <typename Torus>
__host__ void
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in,
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
*mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
allocate_gpu_memory);
}
#endif
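
On clear data, the bivariate lookup applied by host_integer_radix_bitop_kb amounts to evaluating the bitwise operation block by block on messages modulo message_modulus. The plaintext model below sketches that behaviour for AND; the function name and the explicit reduction are illustrative and do not reflect the exact encoding used by int_bitop_buffer.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> radix_bitand_model(const std::vector<uint64_t> &blocks1,
                                         const std::vector<uint64_t> &blocks2,
                                         uint64_t message_modulus) {
  std::vector<uint64_t> out(blocks1.size());
  for (std::size_t i = 0; i < blocks1.size(); i++)
    out[i] = (blocks1[i] & blocks2[i]) % message_modulus;
  return out;
}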

View File

@@ -0,0 +1,45 @@
#include "integer/cmux.cuh"
void scratch_cuda_integer_radix_cmux_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
std::function<uint64_t(uint64_t)> predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
scratch_cuda_integer_radix_cmux_kb(
stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);
}
void cuda_cmux_integer_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count) {
host_integer_radix_cmux_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_condition),
static_cast<uint64_t *>(lwe_array_true),
static_cast<uint64_t *>(lwe_array_false),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -0,0 +1,100 @@
#ifndef CUDA_INTEGER_CMUX_CUH
#define CUDA_INTEGER_CMUX_CUH
#include "integer.cuh"
#include <omp.h>
template <typename Torus>
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_input, Torus *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
int big_lwe_size = params.big_lwe_dimension + 1;
// Left message is shifted
int num_blocks = 0, num_threads = 0;
int num_entries = (params.big_lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
// second operand is fixed
auto tmp_lwe_array_input = mem_ptr->tmp;
for (int i = 0; i < num_radix_blocks; i++) {
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
stream->stream>>>(
lwe_array_out_block, lwe_array_input_block, lwe_condition,
predicate->lwe_indexes, params.big_lwe_dimension,
params.message_modulus, 1);
check_cuda_error(cudaGetLastError());
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
predicate);
}
template <typename Torus>
__host__ void
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_condition, Torus *lwe_array_true,
Torus *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
// Since our CPU threads will be working on different streams we must ensure
// the work in the main stream is completed before they start
stream->synchronize();
auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
ksk, num_radix_blocks);
}
#pragma omp section
{
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
num_radix_blocks);
}
}
cuda_synchronize_stream(true_stream);
cuda_synchronize_stream(false_stream);
// If the condition was true, true_ct will have kept its value and false_ct
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
// have kept its value.
auto added_cts = mem_ptr->tmp_true_ct;
host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
params.big_lwe_dimension, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
mem_ptr->message_extract_lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
*mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory);
}
#endif
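
On clear values, host_integer_radix_cmux_kb above reduces to: zero out the branch the condition rejects, add the two branches, then extract the message. The sketch below is a plaintext model of that flow; names are illustrative only.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> cmux_model(uint64_t condition, // 0 or 1
                                 const std::vector<uint64_t> &ct_true,
                                 const std::vector<uint64_t> &ct_false,
                                 uint64_t message_modulus) {
  std::vector<uint64_t> out(ct_true.size());
  for (std::size_t i = 0; i < out.size(); i++) {
    uint64_t kept_true = condition ? ct_true[i] : 0;   // zero_out_if, inverted predicate
    uint64_t kept_false = condition ? 0 : ct_false[i]; // zero_out_if, predicate
    out[i] = (kept_true + kept_false) % message_modulus; // message_extract_lut
  }
  return out;
}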

View File

@@ -0,0 +1,83 @@
#include "integer/comparison.cuh"
void scratch_cuda_integer_radix_comparison_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
switch (op_type) {
case EQ:
case NE:
scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
break;
case GT:
case GE:
case LT:
case LE:
case MAX:
case MIN:
scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
break;
}
}
void cuda_comparison_integer_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
switch (buffer->op) {
case EQ:
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
break;
case GT:
case GE:
case LT:
case LE:
host_integer_radix_difference_check_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer,
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
break;
case MAX:
case MIN:
host_integer_radix_maxmin_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
break;
default:
printf("Not implemented\n");
}
}
void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -0,0 +1,468 @@
#ifndef CUDA_INTEGER_COMPARISON_OPS_CUH
#define CUDA_INTEGER_COMPARISON_OPS_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "integer/cmux.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_addition.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/kernel_dimensions.cuh"
// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
template <typename Torus>
__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
uint32_t lwe_dimension,
uint32_t num_blocks) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < lwe_dimension + 1) {
auto block = &input_block[idx];
Torus sum = block[0];
for (int i = 1; i < num_blocks; i++) {
sum += block[i * (lwe_dimension + 1)];
}
output[idx] = sum;
}
}
template <typename Torus>
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t num_radix_blocks) {
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
// Add all blocks and store in sum
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
output, input, lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
__host__ void
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto are_all_block_true_buffer =
mem_ptr->eq_buffer->are_all_block_true_buffer;
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
cuda_memcpy_async_gpu_to_gpu(
lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
int lut_num_blocks = 0;
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 1) {
// Split in max_value chunks
uint32_t chunk_length = std::min(max_value, remaining_blocks);
int num_chunks = remaining_blocks / chunk_length;
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = lwe_array_out;
auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(stream, accumulator, input_blocks,
big_lwe_dimension, chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else if (chunk_length == max_value) {
// is_max_value LUT
lut = are_all_block_true_buffer->is_max_value_lut;
} else {
// is_equal_to_num_blocks LUT
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
if (chunk_length != lut_num_blocks) {
auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, is_equal_to_num_blocks_lut_f);
// We don't have to generate this lut again
lut_num_blocks = chunk_length;
}
}
// Applies the LUT
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
}
}
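
A plaintext model of the reduction loop above, for the equality path: the 0/1 blocks are summed in chunks of at most max_value, each chunk sum is mapped back to a single 0/1 block by the "is equal to the number of blocks" test, and the process repeats until one block remains. This sketch mirrors the idea rather than the exact chunking of the CUDA code, and ignores the NE variant that tests for a non-zero sum instead.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

uint64_t all_blocks_true_model(std::vector<uint64_t> blocks, uint64_t max_value) {
  while (blocks.size() > 1) {
    std::vector<uint64_t> next;
    for (std::size_t i = 0; i < blocks.size();) {
      uint64_t chunk = std::min<uint64_t>(max_value, blocks.size() - i);
      uint64_t sum = 0;
      for (uint64_t j = 0; j < chunk; j++)
        sum += blocks[i + j];
      next.push_back(sum == chunk ? 1 : 0); // "is_equal_to_num_blocks" LUT
      i += chunk;
    }
    blocks = next;
  }
  return blocks[0];
}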
// This takes an input slice of blocks.
//
// Each block can encrypt any value as long as it is < message_modulus.
//
// It will compare blocks with 0, for either equality or difference.
//
// This returns a Vec of blocks, where each block encrypts 1 or 0
// depending on whether all blocks matched 0 according to the comparison type.
//
// E.g. for ZeroComparisonType::Equality, if all input blocks are zero
// then all returned blocks will encrypt 1
//
// The returned Vec will have fewer blocks than the number of input blocks.
// The returned blocks potentially need to be 'reduced' to one block
// with e.g. are_all_comparisons_block_true.
//
// This function exists because sometimes it is faster to concatenate
// multiple vecs of 'boolean' shortint blocks before reducing them with
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
int32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// The idea is that we will sum chunks of blocks until carries are full
// then we compare the sum with 0.
//
// If all blocks were 0, the sum will be zero
// If at least one block was not zero, the sum won't be zero
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t message_max = message_modulus - 1;
uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
int num_sum_blocks = 0;
// Accumulator
auto sum = lwe_array_out;
if (num_radix_blocks == 1) {
// Just copy
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
auto sum_i = sum;
auto chunk = lwe_array_in;
while (remainder_blocks > 1) {
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
chunk_size);
num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);
// Update operands
chunk += chunk_size * big_lwe_size;
sum_i += big_lwe_size;
}
}
auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
num_sum_blocks);
// The result will be in the first block. Everything else is
// garbage.
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto eq_buffer = mem_ptr->eq_buffer;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb(
stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
eq_buffer->operator_lut);
// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
bsk, ksk, num_radix_blocks);
// Zero all blocks but the first
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_equality_check_kb(
cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool allocate_gpu_memory) {
*mem_ptr = new int_comparison_buffer<Torus>(
stream, op, params, num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// When rhs > lhs, the subtraction will overflow and the bit of padding will
// be set to 1, meaning that the output of the pbs will be the negative
// (modulo the message space)
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus: 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow the bit of padding is 1 and not 0.
// When applying the LUT for an input value of 14 we would expect 1,
// but since the bit of padding is 1, we will get -1 modulo our message
// space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
big_lwe_dimension, num_radix_blocks);
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
is_non_zero_lut);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
big_lwe_dimension, num_radix_blocks,
message_modulus, carry_modulus);
}
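
Per the comments above and the encoding spelled out in the difference check below, each output block of compare_radix_blocks_kb ends up encoding a sign: 0 when left < right, 1 when equal, 2 when left > right. A plaintext model of that encoding, for reference only:

#include <cstdint>

// 0: left < right, 1: left == right, 2: left > right
uint64_t block_sign_model(uint64_t left, uint64_t right) {
  if (left < right)
    return 0;
  if (left == right)
    return 1;
  return 2;
}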
// Reduces a vec containing shortint blocks that encrypt a sign
// (inferior, equal, superior) to one single shortint block containing the
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = tree_buffer->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// Tree reduction
// Reduces a vec containing shortint blocks that encrypt a sign
// (inferior, equal, superior) to one single shortint block containing the
// final sign
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
auto x = tree_buffer->tmp_x;
auto y = tree_buffer->tmp_y;
if (x != lwe_block_comparisons)
cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
big_lwe_size_bytes * num_radix_blocks, stream);
uint32_t partial_block_count = num_radix_blocks;
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
if ((partial_block_count % 2) != 0) {
partial_block_count >>= 1;
partial_block_count++;
auto last_y_block = y + (partial_block_count - 1) * big_lwe_size;
auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
big_lwe_size_bytes, stream);
} else {
partial_block_count >>= 1;
}
}
auto last_lut = tree_buffer->tree_last_leaf_lut;
auto block_selector_f = tree_buffer->block_selector_f;
std::function<Torus(Torus)> f;
if (partial_block_count == 2) {
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
int lsb = x & 3;
int final_sign = block_selector_f(msb, lsb);
return sign_handler_f(final_sign);
};
} else {
// partial_block_count == 1
y = x;
f = sign_handler_f;
}
generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, f);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
ksk, 1, last_lut);
}
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks) {
auto diff_buffer = mem_ptr->diff_buffer;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
uint32_t num_radix_blocks = total_num_radix_blocks;
auto lhs = lwe_array_left;
auto rhs = lwe_array_right;
if (carry_modulus == message_modulus) {
// Packing is possible
// Pack inputs
Torus *packed_left = diff_buffer->tmp_packed_left;
Torus *packed_right = diff_buffer->tmp_packed_right;
pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
num_radix_blocks, message_modulus);
pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
num_radix_blocks, message_modulus);
// From this point we have half number of blocks
num_radix_blocks /= 2;
// Clean noise
auto cleaning_lut = mem_ptr->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
cleaning_lut);
integer_radix_apply_univariate_lookup_table_kb(
stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
cleaning_lut);
lhs = packed_left;
rhs = packed_right;
}
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
num_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(stream, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
ksk, num_radix_blocks);
// The result will be in the first block. Everything else is garbage.
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
(total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_difference_check_kb(
cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool allocate_gpu_memory) {
*mem_ptr = new int_comparison_buffer<Torus>(
stream, op, params, num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks) {
// Compute the sign
host_integer_radix_difference_check_kb(
stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);
// Selector
host_integer_radix_cmux_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
#endif

View File

@@ -0,0 +1,127 @@
#include "integer/integer.cuh"
#include <linear_algebra.h>
void cuda_full_propagation_64_inplace(
cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
uint32_t num_blocks) {
switch (polynomial_size) {
case 256:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 512:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 1024:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 2048:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 4096:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 8192:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 16384:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
default:
break;
}
}
void scratch_cuda_full_propagation_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
scratch_cuda_full_propagation<uint64_t>(
stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_fullprop_buffer<uint64_t> *mem_ptr =
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
cuda_drop_async(mem_ptr->lut_buffer, stream);
cuda_drop_async(mem_ptr->lut_indexes, stream);
cuda_drop_async(mem_ptr->pbs_buffer, stream);
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
}
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_propagate_single_carry_low_latency_kb_64_inplace(
cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t num_blocks) {
host_propagate_single_carry_low_latency<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -0,0 +1,675 @@
#ifndef CUDA_INTEGER_CUH
#define CUDA_INTEGER_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <algorithm>
#include <cmath>
#include <functional>
template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
if (sizeof(Torus) == sizeof(uint32_t)) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
printf("multibit\n");
printf("Error: 32-bit multibit PBS is not supported.\n");
break;
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
}
} else {
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
input_lwe_ciphertext_count, num_lut_vectors, lwe_idx,
max_shared_memory);
break;
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
}
}
}
// Rotates the blocks of a radix ciphertext to the right by `value` positions.
// The grid is one-dimensional:
// blockIdx.x indexes the source block of the radix ciphertext
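// Illustrative example (not from the original source): with value = 1 and
// blocks [b0, b1, b2, b3], the destination holds [b3, b0, b1, b2], i.e.
// block i of src is written to slot (i + value) % blocks_count of dst.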
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count, uint32_t lwe_size) {
value %= blocks_count;
size_t tid = threadIdx.x;
size_t src_block_id = blockIdx.x;
size_t dst_block_id = (src_block_id + value) % blocks_count;
size_t stride = blockDim.x;
auto cur_src_block = &src[src_block_id * lwe_size];
auto cur_dst_block = &dst[dst_block_id * lwe_size];
for (size_t i = tid; i < lwe_size; i += stride) {
cur_dst_block[i] = cur_src_block[i];
}
}
// Rotates the blocks of a radix ciphertext to the left by `value` positions.
// The grid is one-dimensional:
// blockIdx.x indexes the source block of the radix ciphertext
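// Illustrative example (not from the original source): with value = 1 and
// blocks [b0, b1, b2, b3], the destination holds [b1, b2, b3, b0], i.e.
// block i of src is written to slot (i - value) mod blocks_count of dst.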
template <typename Torus>
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count, uint32_t lwe_size) {
value %= blocks_count;
size_t src_block_id = blockIdx.x;
size_t tid = threadIdx.x;
size_t dst_block_id = (src_block_id >= value)
? src_block_id - value
: src_block_id - value + blocks_count;
size_t stride = blockDim.x;
auto cur_src_block = &src[src_block_id * lwe_size];
auto cur_dst_block = &dst[dst_block_id * lwe_size];
for (size_t i = tid; i < lwe_size; i += stride) {
cur_dst_block[i] = cur_src_block[i];
}
}
// One thread per LWE coefficient, i.e. num_blocks * (lwe_dimension + 1) threads
template <typename Torus>
__global__ void
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, Torus *lwe_indexes,
uint32_t lwe_dimension, uint32_t message_modulus,
uint32_t num_blocks) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < num_blocks * (lwe_dimension + 1)) {
int block_id = tid / (lwe_dimension + 1);
int coeff_id = tid % (lwe_dimension + 1);
int pos = lwe_indexes[block_id] * (lwe_dimension + 1) + coeff_id;
lwe_array_out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos];
}
}
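// Illustrative example for the packing above (hypothetical values, assuming
// message_modulus = 4): if a block of lwe_array_1 encrypts 2 and the matching
// block of lwe_array_2 encrypts 3, the packed block encrypts 2 * 4 + 3 = 11,
// which a bivariate LUT can later split back into lhs = 11 / 4 = 2 and
// rhs = 11 % 4 = 3.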
template <typename Torus>
__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_1, Torus *lwe_array_2,
Torus *lwe_indexes, uint32_t lwe_dimension,
uint32_t message_modulus,
uint32_t num_radix_blocks) {
// Left message is shifted
int num_blocks = 0, num_threads = 0;
int num_entries = num_radix_blocks * (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
lwe_array_out, lwe_array_1, lwe_array_2, lwe_indexes, lwe_dimension,
message_modulus, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
// Compute Keyswitch-PBS
cuda_keyswitch_lwe_ciphertext_vector(
stream, lut->tmp_lwe_after_ks, lut->lwe_indexes, lwe_array_in,
lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut) {
// apply_lookup_table_bivariate
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
// Left message is shifted
pack_bivariate_blocks(stream, lut->tmp_lwe_before_ks, lwe_array_1,
lwe_array_2, lut->lwe_indexes, big_lwe_dimension,
message_modulus, num_radix_blocks);
check_cuda_error(cudaGetLastError());
// Apply LUT
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
lut->tmp_lwe_before_ks, bsk,
ksk, num_radix_blocks, lut);
}
// Rotates the slice in-place such that the first mid elements of the slice
// move to the end while the remaining array_length - mid elements move to the
// front. After calling rotate_left, the element previously at index mid
// becomes the first element in the slice.
template <typename Torus>
void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
mid = mid % array_length;
std::rotate(buffer, buffer + mid, buffer + array_length);
}
template <typename Torus>
void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f) {
uint32_t modulus_sup = message_modulus * carry_modulus;
uint32_t box_size = polynomial_size / modulus_sup;
Torus delta = (1ul << 63) / modulus_sup;
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
auto body = &acc[glwe_dimension * polynomial_size];
  // Fill each box of the accumulator with f(i) * delta
for (int i = 0; i < modulus_sup; i++) {
int index = i * box_size;
for (int j = index; j < index + box_size; j++) {
auto f_eval = f(i);
body[j] = f_eval * delta;
}
}
int half_box_size = box_size / 2;
// Negate the first half_box_size coefficients
for (int i = 0; i < half_box_size; i++) {
body[i] = -body[i];
}
rotate_left(body, half_box_size, polynomial_size);
}
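// Illustrative sizing example for generate_lookup_table (hypothetical
// parameters): with message_modulus = 4, carry_modulus = 4 and
// polynomial_size = 2048, modulus_sup = 16, box_size = 2048 / 16 = 128 and
// delta = 2^63 / 16 = 2^59. Each input value i then owns a box of 128
// coefficients set to f(i) * delta, and negating and rotating the first
// half_box_size = 64 coefficients centers the boxes for the negacyclic
// blind rotation.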
template <typename Torus>
void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f) {
Torus factor_u64 = message_modulus;
auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
Torus lhs = (input / factor_u64) % message_modulus;
Torus rhs = (input % factor_u64) % message_modulus;
return f(lhs, rhs);
};
generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, wrapped_f);
}
/*
 * generate bivariate accumulator (lut) on the device
 * stream - cuda stream
 * acc_bivariate - device pointer for the bivariate accumulator
 * ...
 * f - wrapping function with two Torus inputs
 */
template <typename Torus>
void generate_device_accumulator_bivariate(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill bivariate accumulator
generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
  // copy host lut to device
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
cuda_synchronize_stream(stream);
free(h_lut);
}
/*
 * generate univariate accumulator (lut) on the device
 * stream - cuda stream
 * acc - device pointer for the accumulator
 * ...
 * f - evaluating function with one Torus input
 */
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill accumulator
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
  // copy host lut to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);
cuda_synchronize_stream(stream);
free(h_lut);
}
template <typename Torus>
void scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}
template <typename Torus>
void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
Torus *lwe_array,
int_sc_prop_memory<Torus> *mem,
void *bsk, Torus *ksk,
uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
auto generates_or_propagates = mem->generates_or_propagates;
auto step_output = mem->step_output;
auto test_vector_array = mem->test_vector_array;
auto lut_carry_propagation_sum = mem->lut_carry_propagation_sum;
auto message_acc = mem->message_acc;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
test_vector_array);
// compute prefix sum with hillis&steele
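  // Illustrative trace (hypothetical example, assuming num_blocks = 8):
  // num_steps = 3 and the loop runs with space = 1, 2, 4, each time combining
  // block i with block i - space through lut_carry_propagation_sum, so every
  // block ends up knowing whether a carry reaches it.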
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, stream);
for (int step = 0; step < num_steps; step++) {
auto cur_blocks = &step_output[space * big_lwe_size];
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
lut_carry_propagation_sum);
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
cur_blocks,
big_lwe_size_bytes * cur_total_blocks, stream);
space *= 2;
}
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
host_addition(stream, lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
}
/*
 * input_blocks: input radix ciphertext, the propagation happens in place
 * the scratch buffer (mem_ptr) provides:
 * - lut_buffer: a list of two luts, [(message_acc), (carry_acc)]
 * - lut_indexes: lut indexes for message and carry, should always be {0, 1}
 * - tmp_small_lwe_vector: output of the keyswitch, should have
 *   size = 2 * (lwe_dimension + 1) * sizeof(Torus)
 * - tmp_big_lwe_vector: output of the pbs, should have
 *   size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
template <typename Torus, typename STorus, class params>
void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus *ksk, void *bsk, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_base_log,
uint32_t ks_level, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t grouping_factor,
uint32_t num_blocks) {
int big_lwe_size = (glwe_dimension * polynomial_size + 1);
int small_lwe_size = (lwe_dimension + 1);
for (int i = 0; i < num_blocks; i++) {
auto cur_input_block = &input_blocks[i * big_lwe_size];
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
cur_input_block, mem_ptr->lwe_indexes, ksk,
polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
1);
cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
mem_ptr->tmp_small_lwe_vector,
small_lwe_size * sizeof(Torus), stream);
execute_pbs<Torus>(
stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
mem_ptr->lut_buffer, mem_ptr->lut_indexes,
mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), stream);
if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition(stream, next_input_block, next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
glwe_dimension * polynomial_size, 1);
}
}
}
template <typename Torus>
void scratch_cuda_full_propagation(
cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
// PBS
int8_t *pbs_buffer;
if (pbs_type == MULTI_BIT) {
uint32_t lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, pbs_level, glwe_dimension);
// Only 64 bits is supported
scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level,
grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory, lwe_chunk_size);
} else {
// Classic
// We only use low latency for classic mode
if (sizeof(Torus) == sizeof(uint32_t))
scratch_cuda_bootstrap_low_latency_32(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory);
else
scratch_cuda_bootstrap_low_latency_64(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory);
}
// LUT
Torus *lut_buffer;
if (allocate_gpu_memory) {
    // The LUT is stored as a trivial GLWE encryption: the mask is set to zero
    // and the LUT values are encoded in the body
Torus lut_buffer_size =
2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);
// LUTs
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
//
Torus *lut_buffer_message = lut_buffer;
Torus *lut_buffer_carry =
lut_buffer + (glwe_dimension + 1) * polynomial_size;
generate_device_accumulator<Torus>(
stream, lut_buffer_message, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}
Torus *lut_indexes;
if (allocate_gpu_memory) {
lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);
Torus h_lut_indexes[2] = {0, 1};
cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
stream);
}
Torus *lwe_indexes;
if (allocate_gpu_memory) {
Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
}
// Temporary arrays
Torus *small_lwe_vector;
Torus *big_lwe_vector;
if (allocate_gpu_memory) {
Torus small_vector_size = 2 * (lwe_dimension + 1) * sizeof(Torus);
Torus big_vector_size =
2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);
small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
}
*mem_ptr = new int_fullprop_buffer<Torus>;
(*mem_ptr)->pbs_type = pbs_type;
(*mem_ptr)->pbs_buffer = pbs_buffer;
(*mem_ptr)->lut_buffer = lut_buffer;
(*mem_ptr)->lut_indexes = lut_indexes;
(*mem_ptr)->lwe_indexes = lwe_indexes;
(*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
(*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
}
// (lwe_dimension+1) threads in total;
// each thread loops over the (num_radix_blocks / 2) output blocks
template <typename Torus>
__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < (lwe_dimension + 1)) {
for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
Torus *msb_block = lsb_block + (lwe_dimension + 1);
Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
}
if (num_radix_blocks % 2 != 0) {
// We couldn't pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Torus *last_block =
lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);
last_block[tid] = lsb_block[tid];
}
}
}
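// Illustrative example (hypothetical, with num_radix_blocks = 5 and
// factor = message_modulus = 4): the output holds
// [b0 + 4*b1, b2 + 4*b3, b4], i.e. two packed blocks plus a copy of the
// unpaired last block.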
// Packs pairs of adjacent blocks: the low (lsb) block is kept in the message
// part and the high (msb) block is shifted into the carry part of a single
// ciphertext.
//
// This requires the block parameters to have enough room for two ciphertexts,
// i.e. a carry modulus at least as large as the message modulus
//
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
assert(lwe_array_out != lwe_array_in);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}
template <typename Torus>
__global__ void
device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
Torus scalar = scalar_input[tid];
Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
*body = scalar * delta;
}
}
template <typename Torus>
__host__ void
create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *scalar_array, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
uint64_t message_modulus, uint64_t carry_modulus) {
size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);
if (num_scalar_blocks == 0)
return;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = num_scalar_blocks;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
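  // e.g. (hypothetical parameters) with message_modulus = 4 and
  // carry_modulus = 4, delta = 2^63 / 16 = 2^59, so a scalar block value of 3
  // is encoded as 3 * 2^59 in the trivial ciphertext body.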
device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_INTEGER_CUH

View File

@@ -0,0 +1,107 @@
#include "integer/multiplication.cuh"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size, lwe_dimension, ks_level, ks_base_log,
pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
switch (polynomial_size) {
case 2048:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * Computes a multiplication between two 64-bit radix lwe ciphertexts
 * encrypting integer values. The keyswitch -> bootstrap pattern is used; the
 * function works on a single pair of radix ciphertexts, and 'stream' can be
 * used for parallelization
 * - 'stream' is the Cuda stream (and GPU) to be used in the kernel launches
 * - 'radix_lwe_out' is the 64-bit radix big lwe ciphertext holding the
 * product of the multiplication
 * - 'radix_lwe_left' left radix big lwe ciphertext
 * - 'radix_lwe_right' right radix big lwe ciphertext
 * - 'bsk' bootstrapping key in the fourier domain
 * - 'ksk' keyswitching key
 * - 'mem_ptr' scratch buffer allocated by
 * scratch_cuda_integer_mult_radix_ciphertext_kb_64
 * - 'message_modulus' message modulus
 * - 'carry_modulus' carry modulus
 * - 'glwe_dimension' glwe dimension
 * - 'lwe_dimension' is the dimension of the small lwe ciphertext
 * - 'polynomial_size' polynomial size
 * - 'pbs_base_log' base log used in the pbs
 * - 'pbs_level' decomposition level count used in the pbs
 * - 'ks_base_log' base log used in the keyswitch
 * - 'ks_level' decomposition level count used in the keyswitch
 * - 'num_blocks' is the number of big lwe ciphertext blocks inside the radix
 * ciphertext
 * - 'pbs_type' selects which PBS implementation should be used
 * - 'max_shared_memory' maximum shared memory per cuda block
 */
void cuda_integer_mult_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
default:
break;
}
}
void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}
void cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
cuda_stream_t *stream, void *lwe_array, uint64_t scalar,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
stream, lwe_array, lwe_array, scalar, lwe_dimension,
lwe_ciphertext_count);
}
void cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
cuda_stream_t *stream, void *output_lwe_array, void *input_lwe_array,
uint64_t scalar, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
host_integer_small_scalar_mult_radix(
stream, static_cast<uint64_t *>(output_lwe_array),
static_cast<uint64_t *>(input_lwe_array), scalar, lwe_dimension,
lwe_ciphertext_count);
}

View File

@@ -0,0 +1,639 @@
#ifndef CUDA_INTEGER_MULT_CUH
#define CUDA_INTEGER_MULT_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linear_algebra.h"
#include "pbs/bootstrap_amortized.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
Torus *msb_ciphertext, Torus *radix_lwe_right,
Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {
size_t block_id = blockIdx.x;
double D = sqrt((2 * num_blocks + 1) * (2 * num_blocks + 1) - 8 * block_id);
size_t radix_id = int((2 * num_blocks + 1 - D) / 2.);
size_t local_block_id =
block_id - (2 * num_blocks - radix_id + 1) / 2. * radix_id;
bool process_msb = (local_block_id < (num_blocks - radix_id - 1));
auto cur_lsb_block = &lsb_ciphertext[block_id * (params::degree + 1)];
auto cur_msb_block =
(process_msb)
? &msb_ciphertext[(block_id - radix_id) * (params::degree + 1)]
: nullptr;
auto cur_lsb_rhs_block = &lsb_rhs[block_id * (params::degree + 1)];
auto cur_msb_rhs_block =
(process_msb) ? &msb_rhs[(block_id - radix_id) * (params::degree + 1)]
: nullptr;
auto cur_ct_right = &radix_lwe_right[radix_id * (params::degree + 1)];
auto cur_src = &radix_lwe_left[local_block_id * (params::degree + 1)];
size_t tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
Torus value = cur_src[tid];
if (process_msb) {
cur_lsb_block[tid] = cur_msb_block[tid] = value;
cur_lsb_rhs_block[tid] = cur_msb_rhs_block[tid] = cur_ct_right[tid];
} else {
cur_lsb_block[tid] = value;
cur_lsb_rhs_block[tid] = cur_ct_right[tid];
}
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
Torus value = cur_src[params::degree];
if (process_msb) {
cur_lsb_block[params::degree] = cur_msb_block[params::degree] = value;
cur_lsb_rhs_block[params::degree] = cur_msb_rhs_block[params::degree] =
cur_ct_right[params::degree];
} else {
cur_lsb_block[params::degree] = value;
cur_lsb_rhs_block[params::degree] = cur_ct_right[params::degree];
}
}
}
template <typename Torus>
void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
Torus *dst, int *S, int *F, int num_blocks,
uint32_t map_size, uint32_t unit_size,
int &total_copied, bool is_message) {
for (int i = 0; i < map_size; i++) {
int s_index = i * num_blocks + S[i];
int number_of_unit = F[i] - S[i] + is_message;
auto cur_dst = &dst[total_copied * unit_size];
auto cur_src = &src[s_index * unit_size];
size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
total_copied += number_of_unit;
}
}
template <typename Torus>
void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
Torus *dst, int *S, int *F,
uint32_t map_size, uint32_t unit_size,
int &total_copied,
int &total_radix_copied,
int num_blocks, bool is_message) {
size_t radix_size = unit_size * num_blocks;
for (int i = 0; i < map_size; i++) {
auto cur_dst_radix = &dst[total_radix_copied * radix_size];
int s_index = S[i];
int number_of_unit = F[i] - s_index + is_message;
if (!is_message) {
int zero_block_count = num_blocks - number_of_unit;
cuda_memset_async(cur_dst_radix, 0,
zero_block_count * unit_size * sizeof(Torus), stream);
s_index = zero_block_count;
}
auto cur_dst = &cur_dst_radix[s_index * unit_size];
auto cur_src = &src[total_copied * unit_size];
size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
total_copied += number_of_unit;
++total_radix_copied;
}
}
template <typename Torus, class params>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t num_blocks) {
extern __shared__ Torus result[];
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * (params::degree + 1);
size_t radix_elem_size = num_blocks * (params::degree + 1);
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
size_t block_stride = blockIdx.y * (params::degree + 1);
auto dst_block = &dst_radix[block_stride];
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
result[tid] = src_chunk[block_stride + tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree] = src_chunk[block_stride + params::degree];
}
// accumulate rest of the radixes
for (int r_id = 1; r_id < chunk_size; r_id++) {
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
result[tid] += cur_src_radix[block_stride + tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree] += cur_src_radix[block_stride + params::degree];
}
}
// put result from shared mem to global mem
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
dst_block[tid] = result[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst_block[params::degree] = result[params::degree];
}
}
template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
Torus *msb_blocks,
uint32_t glwe_dimension,
uint32_t lsb_count, uint32_t msb_count,
uint32_t num_blocks) {
size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
size_t big_lwe_id = blockIdx.x;
size_t radix_id = big_lwe_id / num_blocks;
size_t block_id = big_lwe_id % num_blocks;
size_t lsb_block_id = block_id - radix_id;
size_t msb_block_id = block_id - radix_id - 1;
bool process_lsb = (radix_id <= block_id);
bool process_msb = (radix_id + 1 <= block_id);
auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_dimension];
auto cur_res_msb_ct =
&result_blocks[num_blocks * num_blocks * big_lwe_dimension +
big_lwe_id * big_lwe_dimension];
Torus *cur_lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) *
radix_id / 2 * (params::degree + 1)];
Torus *cur_msb_radix = (process_msb)
? &msb_blocks[(2 * num_blocks - radix_id - 1) *
radix_id / 2 * (params::degree + 1)]
: nullptr;
Torus *cur_lsb_ct = (process_lsb)
? &cur_lsb_radix[lsb_block_id * (params::degree + 1)]
: nullptr;
Torus *cur_msb_ct = (process_msb)
? &cur_msb_radix[msb_block_id * (params::degree + 1)]
: nullptr;
size_t tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
cur_res_lsb_ct[tid] = (process_lsb) ? cur_lsb_ct[tid] : 0;
cur_res_msb_ct[tid] = (process_msb) ? cur_msb_ct[tid] : 0;
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
cur_res_lsb_ct[params::degree] =
(process_lsb) ? cur_lsb_ct[params::degree] : 0;
cur_res_msb_ct[params::degree] =
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
int big_lwe_dimension = glwe_dimension * polynomial_size;
int big_lwe_size = big_lwe_dimension + 1;
// 'vector_result_lsb' contains blocks from all possible right shifts of
// radix_lwe_left, only nonzero blocks are kept
int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;
// 'vector_result_msb' contains blocks from all possible shifts of
// radix_lwe_left except the last blocks of each shift. Only nonzero blocks
// are kept
int msb_vector_block_count = num_blocks * (num_blocks - 1) / 2;
// total number of blocks msb and lsb
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
  // buffer to keep all lsb and msb shifts
  // for lsb, all nonzero blocks of each right shift are kept:
  // for shift 0: num_blocks blocks
  // for shift 1: num_blocks - 1 blocks
  // ...
  // for shift num_blocks - 1: 1 block
  // i.e. (num_blocks + 1) * num_blocks / 2 blocks in total
  // for msb, the last block of each shift is dropped:
  // for shift 0: num_blocks - 1 blocks
  // for shift 1: num_blocks - 2 blocks
  // ...
  // for shift num_blocks - 1: 0 blocks
  // i.e. (num_blocks - 1) * num_blocks / 2 blocks in total
  // overall num_blocks^2 blocks, each one a big lwe ciphertext with
  // glwe_dimension * polynomial_size + 1 coefficients
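  // Illustrative count (hypothetical, with num_blocks = 4): lsb keeps
  // 4 + 3 + 2 + 1 = 10 blocks, msb keeps 3 + 2 + 1 + 0 = 6 blocks,
  // 16 = num_blocks^2 blocks in total.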
auto vector_result_sb = mem_ptr->vector_result_sb;
  // buffer to keep lsb_vector + msb_vector
  // addition happens in full terms, so there are num_blocks terms and each
  // term has num_blocks blocks,
  // num_blocks^2 blocks in total,
  // and each block is a big lwe ciphertext with
  // glwe_dimension * polynomial_size + 1 coefficients
auto block_mul_res = mem_ptr->block_mul_res;
  // buffer to keep the keyswitch result of num_blocks^2 ciphertexts
  // in total it has num_blocks^2 small lwe ciphertexts with
  // lwe_dimension + 1 coefficients
auto small_lwe_vector = mem_ptr->small_lwe_vector;
// buffer to keep pbs result for num_blocks^2 lwe_ciphertext
// in total it has num_blocks^2 big lwe ciphertexts with
// glwe_dimension * polynomial_size + 1 coefficients
auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;
  // it contains two test vectors, the first for lsb extraction and
  // the second for msb extraction, with total length =
  // 2 * (glwe_dimension + 1) * polynomial_size
auto test_vector_array = mem_ptr->test_vector_array;
// accumulator to extract message
// with length (glwe_dimension + 1) * polynomial_size
auto test_vector_message = mem_ptr->test_vector_message;
// accumulator to extract carry
// with length (glwe_dimension + 1) * polynomial_size
auto test_vector_carry = mem_ptr->test_vector_carry;
// to be used as default indexing
auto lwe_indexes = test_vector_array->lwe_indexes;
auto vector_result_lsb = &vector_result_sb[0];
auto vector_result_msb =
&vector_result_sb[lsb_vector_block_count *
(polynomial_size * glwe_dimension + 1)];
auto vector_lsb_rhs = &block_mul_res[0];
auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count *
(polynomial_size * glwe_dimension + 1)];
dim3 grid(lsb_vector_block_count, 1, 1);
dim3 thds(params::degree / params::opt, 1, 1);
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
vector_lsb_rhs, vector_msb_rhs, num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
total_block_count, test_vector_array);
vector_result_lsb = &block_mul_res[0];
vector_result_msb = &block_mul_res[lsb_vector_block_count *
(polynomial_size * glwe_dimension + 1)];
fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
stream->stream>>>(vector_result_sb, vector_result_lsb,
vector_result_msb, glwe_dimension,
lsb_vector_block_count, msb_vector_block_count,
num_blocks);
auto new_blocks = block_mul_res;
auto old_blocks = vector_result_sb;
// amount of current radixes after block_mul
size_t r = 2 * num_blocks;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;
size_t ch_amount = r / chunk_size;
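  // e.g. (hypothetical parameters) with message_modulus = 4 and
  // carry_modulus = 4: total_modulus = 16, message_max = 3, so
  // chunk_size = 15 / 3 = 5 radixes can be summed before the carry space
  // overflows, and ch_amount = r / 5 chunks are reduced per iteration.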
int terms_degree[r * num_blocks];
int f_b[ch_amount];
int l_b[ch_amount];
for (int i = 0; i < num_blocks * num_blocks; i++) {
size_t r_id = i / num_blocks;
size_t b_id = i % num_blocks;
terms_degree[i] = (b_id >= r_id) ? 3 : 0;
}
auto terms_degree_msb = &terms_degree[num_blocks * num_blocks];
for (int i = 0; i < num_blocks * num_blocks; i++) {
size_t r_id = i / num_blocks;
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? 2 : 0;
}
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
while (r > chunk_size) {
int cur_total_blocks = r * num_blocks;
ch_amount = r / chunk_size;
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cuda_memset_async(new_blocks, 0,
ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
stream);
tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
new_blocks, old_blocks, chunk_size, num_blocks);
for (int c_id = 0; c_id < ch_amount; c_id++) {
auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
int mx = 0;
int mn = num_blocks;
for (int r_id = 1; r_id < chunk_size; r_id++) {
auto cur_radix = &cur_chunk[r_id * num_blocks];
for (int i = 0; i < num_blocks; i++) {
if (cur_radix[i]) {
mn = min(mn, i);
mx = max(mx, i);
}
}
}
f_b[c_id] = mn;
l_b[c_id] = mx;
}
int total_copied = 0;
int message_count = 0;
int carry_count = 0;
compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
l_b, num_blocks, ch_amount,
big_lwe_size, total_copied, true);
message_count = total_copied;
compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
l_b, num_blocks, ch_amount,
big_lwe_size, total_copied, false);
carry_count = total_copied - message_count;
auto message_blocks_vector = old_blocks;
auto carry_blocks_vector =
&old_blocks[message_count * (glwe_dimension * polynomial_size + 1)];
cuda_keyswitch_lwe_ciphertext_vector(
stream, small_lwe_vector, lwe_indexes, old_blocks, lwe_indexes, ksk,
polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
execute_pbs<Torus>(
stream, message_blocks_vector, lwe_indexes, test_vector_message->lut,
test_vector_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
test_vector_message->pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
test_vector_carry->lut, test_vector_carry->lut_indexes,
&small_lwe_vector[message_count * (lwe_dimension + 1)],
lwe_indexes, bsk, test_vector_carry->pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, carry_count, 1, 0,
max_shared_memory, mem_ptr->params.pbs_type);
int rem_blocks = r % chunk_size * num_blocks;
int new_blocks_created = 2 * ch_amount * num_blocks;
int copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
total_copied = 0;
int total_radix_copied = 0;
extract_message_carry_to_full_radix<Torus>(
stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
total_copied, total_radix_copied, num_blocks, true);
extract_message_carry_to_full_radix<Torus>(
stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
total_copied, total_radix_copied, num_blocks, false);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_blocks;
}
dim3 add_grid(1, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
stream);
tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
radix_lwe_out, old_blocks, r, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
test_vector_message);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
test_vector_carry);
cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
big_lwe_size, num_blocks);
host_propagate_single_carry_low_latency<Torus>(
stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
}
template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}
// Function to apply a lookup table.
// It has two modes:
// lsb_msb_mode == true - extracts lsb and msb
// lsb_msb_mode == false - extracts message and carry
template <typename Torus, typename STorus, class params>
void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
int_mul_memory<Torus> *mem_ptr, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor,
uint32_t lsb_message_blocks_count,
uint32_t msb_carry_blocks_count,
uint32_t max_shared_memory, bool lsb_msb_mode) {
int total_blocks_count = lsb_message_blocks_count + msb_carry_blocks_count;
int gpu_n = mem_ptr->p2p_gpu_count;
if (total_blocks_count < gpu_n)
gpu_n = total_blocks_count;
int gpu_blocks_count = total_blocks_count / gpu_n;
int big_lwe_size = glwe_dimension * polynomial_size + 1;
// int small_lwe_size = lwe_dimension + 1;
#pragma omp parallel for num_threads(gpu_n)
for (int i = 0; i < gpu_n; i++) {
cudaSetDevice(i);
auto this_stream = mem_ptr->streams[i];
// Index where input and output blocks start for current gpu
int big_lwe_start_index = i * gpu_blocks_count * big_lwe_size;
    // The last gpu might have extra blocks to process if the total number of
    // blocks is not divisible by gpu_n
if (i == gpu_n - 1) {
gpu_blocks_count += total_blocks_count % gpu_n;
}
int can_access_peer;
cudaDeviceCanAccessPeer(&can_access_peer, i, 0);
if (i == 0) {
check_cuda_error(
cudaMemcpyAsync(mem_ptr->pbs_output_multi_gpu[i],
&input_ciphertexts[big_lwe_start_index],
gpu_blocks_count * big_lwe_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *this_stream));
} else if (can_access_peer) {
check_cuda_error(cudaMemcpyPeerAsync(
mem_ptr->pbs_output_multi_gpu[i], i,
&input_ciphertexts[big_lwe_start_index], 0,
gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
} else {
// Uses host memory as middle ground
cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
&input_ciphertexts[big_lwe_start_index],
gpu_blocks_count * big_lwe_size * sizeof(Torus),
this_stream, i);
cuda_memcpy_async_to_gpu(
mem_ptr->pbs_output_multi_gpu[i], mem_ptr->device_to_device_buffer[i],
gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
}
    // when lsb and msb have to be extracted:
    // the first lsb_count blocks need lsb_acc,
    // the last msb_count blocks need msb_acc
    // when message and carry have to be extracted:
    // the first message_count blocks need message_acc,
    // the last carry_count blocks need carry_acc
Torus *cur_tvi;
if (lsb_msb_mode) {
cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->tvi_lsb_multi_gpu[i]
: mem_ptr->tvi_msb_multi_gpu[i];
} else {
cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->tvi_message_multi_gpu[i]
: mem_ptr->tvi_carry_multi_gpu[i];
}
    // execute the keyswitch on the current gpu with the corresponding input
    // and output blocks: pbs_output_multi_gpu[i] is the input of the
    // keyswitch and pbs_input_multi_gpu[i] is its output
cuda_keyswitch_lwe_ciphertext_vector(
this_stream, i, mem_ptr->pbs_input_multi_gpu[i],
mem_ptr->pbs_output_multi_gpu[i], mem_ptr->ksk_multi_gpu[i],
polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
gpu_blocks_count);
    // execute the pbs on the current gpu with the corresponding input and output
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
mem_ptr->test_vector_multi_gpu[i], cur_tvi,
mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, pbs_base_log, pbs_level,
grouping_factor, gpu_blocks_count, 2, 0, max_shared_memory);
    // the lookup table has been applied; the data from the current gpu now
    // has to be copied back to gpu 0 into the 'output_ciphertexts' buffer
if (i == 0) {
check_cuda_error(
cudaMemcpyAsync(&output_ciphertexts[big_lwe_start_index],
mem_ptr->pbs_output_multi_gpu[i],
gpu_blocks_count * big_lwe_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *this_stream));
} else if (can_access_peer) {
check_cuda_error(cudaMemcpyPeerAsync(
&output_ciphertexts[big_lwe_start_index], 0,
mem_ptr->pbs_output_multi_gpu[i], i,
gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
} else {
// Uses host memory as middle ground
cuda_memcpy_async_to_cpu(
mem_ptr->device_to_device_buffer[i], mem_ptr->pbs_output_multi_gpu[i],
gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
cuda_memcpy_async_to_gpu(&output_ciphertexts[big_lwe_start_index],
mem_ptr->device_to_device_buffer[i],
gpu_blocks_count * big_lwe_size * sizeof(Torus),
this_stream, i);
}
}
}
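// Illustrative sketch (added for exposition, not part of the original file):
// the inter-GPU copy strategy used in the loop above, reduced to plain CUDA
// runtime calls. A chunk living on device 0 is sent to device `gpu_i` either
// through a peer-to-peer copy when cudaDeviceCanAccessPeer reports support,
// or by staging through a pinned host buffer otherwise. All names below are
// hypothetical.
template <typename Torus>
static void example_copy_chunk_from_gpu0(Torus *dst_on_gpu_i,
                                         Torus const *src_on_gpu_0,
                                         Torus *pinned_host_staging,
                                         size_t n_elems, int gpu_i,
                                         cudaStream_t stream_i) {
  size_t n_bytes = n_elems * sizeof(Torus);
  if (gpu_i == 0) {
    // Same device: a regular device-to-device copy is enough
    check_cuda_error(cudaMemcpyAsync(dst_on_gpu_i, src_on_gpu_0, n_bytes,
                                     cudaMemcpyDeviceToDevice, stream_i));
    return;
  }
  int can_access_peer = 0;
  check_cuda_error(cudaDeviceCanAccessPeer(&can_access_peer, gpu_i, 0));
  if (can_access_peer) {
    // Direct GPU-to-GPU transfer
    check_cuda_error(cudaMemcpyPeerAsync(dst_on_gpu_i, gpu_i, src_on_gpu_0, 0,
                                         n_bytes, stream_i));
  } else {
    // Fall back to host memory as a middle ground
    check_cuda_error(cudaMemcpyAsync(pinned_host_staging, src_on_gpu_0,
                                     n_bytes, cudaMemcpyDeviceToHost,
                                     stream_i));
    check_cuda_error(cudaMemcpyAsync(dst_on_gpu_i, pinned_host_staging,
                                     n_bytes, cudaMemcpyHostToDevice,
                                     stream_i));
  }
}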
template <typename T>
__global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
T *input_lwe_array,
T scalar,
uint32_t lwe_dimension,
uint32_t num_blocks) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int lwe_size = lwe_dimension + 1;
if (index < num_blocks * lwe_size) {
// Here we take advantage of the wrapping behaviour of uint
output_lwe_array[index] = input_lwe_array[index] * scalar;
}
}
template <typename T>
__host__ void host_integer_small_scalar_mult_radix(
cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
input_lwe_ciphertext_count);
check_cuda_error(cudaGetLastError());
}
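// Note added for exposition (not in the original sources):
// getNumBlocksAndThreads is assumed to behave like the usual 1-D
// launch-configuration helper, i.e. roughly
//   num_threads = min(512, num_entries);
//   num_blocks  = (num_entries + num_threads - 1) / num_threads;
// so that one CUDA thread handles one torus element of the radix ciphertext.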
#endif

View File

@@ -0,0 +1,12 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64_inplace(
cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(lwe_array), lwe_dimension,
lwe_ciphertext_count, message_modulus,
carry_modulus);
}

View File

@@ -0,0 +1,79 @@
#ifndef CUDA_INTEGER_NEGATE_CUH
#define CUDA_INTEGER_NEGATE_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "integer.h"
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t lwe_dimension, uint64_t message_modulus,
uint64_t carry_modulus, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
// z = ceil( degree / 2^p ) * 2^p
uint64_t z = (2 * message_modulus - 1) / message_modulus;
__syncthreads();
z *= message_modulus;
// (0,Delta*z) - ct
output[tid] = (is_body ? z * delta - input[tid] : -input[tid]);
for (int radix_block_id = 1; radix_block_id < num_blocks;
radix_block_id++) {
tid += (lwe_dimension + 1);
// Subtract z/B from the next ciphertext to compensate for the addition of z
uint64_t zb = z / message_modulus;
uint64_t encoded_zb = zb * delta;
__syncthreads();
// (0,Delta*z) - ct
output[tid] =
(is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
__syncthreads();
}
}
}
template <typename Torus>
__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus,
uint64_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
carry_modulus, delta);
check_cuda_error(cudaGetLastError());
}
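// Worked example (added for exposition, not in the original comments), with
// message_modulus = 4: z = ((2 * 4 - 1) / 4) * 4 = 4, so the first block is
// mapped to (0, 4 * delta) - ct, which keeps the encrypted digit
// non-negative. Each following block additionally subtracts the encoded
// zb = z / message_modulus = 1 before being negated, compensating for the z
// that was injected into the block below it.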
#endif

View File

@@ -0,0 +1,12 @@
#include "integer/scalar_addition.cuh"
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
cuda_stream_t *stream, void *lwe_array, void *scalar_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace(
stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(scalar_input), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}

View File

@@ -0,0 +1,130 @@
#ifndef CUDA_INTEGER_ADD_CUH
#define CUDA_INTEGER_ADD_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "integer.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
Torus scalar = scalar_input[tid];
Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
*body += scalar * delta;
}
}
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
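// Example (added for exposition): with message_modulus = 4 and
// carry_modulus = 4, delta = 2^63 / 16 = 2^59, so a clear digit s is encoded
// as s * 2^59 before being added to the ciphertext body below.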
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
__global__ void device_integer_radix_add_scalar_one_inplace(
Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
*body += delta;
}
}
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
__global__ void device_integer_radix_scalar_subtraction_inplace(
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
Torus scalar = scalar_input[tid];
Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
*body -= scalar * delta;
}
}
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
}
#endif

View File

@@ -0,0 +1,14 @@
#include "integer/scalar_bitops.cuh"
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_input),
static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count, op);
}

View File

@@ -0,0 +1,51 @@
#ifndef CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH
#define CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH
#include "integer/bitwise_ops.cuh"
#include <omp.h>
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
Torus *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, BITOP_TYPE op) {
auto lut = mem_ptr->lut;
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;
uint32_t lwe_size = big_lwe_dimension + 1;
if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out, 0,
num_radix_blocks * lwe_size * sizeof(Torus), stream);
} else {
cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
num_radix_blocks * lwe_size * sizeof(Torus),
stream);
}
} else {
auto lut_buffer = lut->lut;
// We have all possible LUTs pre-computed, and we use the decomposed scalar
// blocks as indexes to recover the right one for each block
cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
num_clear_blocks * sizeof(Torus), stream);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
lut);
if (op == SCALAR_BITAND) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
(num_radix_blocks - num_clear_blocks) * lwe_size *
sizeof(Torus),
stream);
}
}
}
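// Note added for exposition (not in the original sources): scalar blocks
// beyond num_clear_blocks are implicitly zero. Since x AND 0 == 0, the
// corresponding SCALAR_BITAND output blocks can simply be set to trivial
// encryptions of zero with a memset, while for OR/XOR a zero scalar block
// leaves the input block unchanged (x OR 0 == x, x XOR 0 == x), so no lookup
// table evaluation is needed for those blocks.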
#endif

View File

@@ -0,0 +1,44 @@
#include "integer/scalar_comparison.cuh"
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
switch (buffer->op) {
// case EQ:
// case NE:
// host_integer_radix_equality_check_kb<uint64_t>(
// stream, static_cast<uint64_t *>(lwe_array_out),
// static_cast<uint64_t *>(lwe_array_1),
// static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
// static_cast<uint64_t *>(ksk), glwe_dimension, polynomial_size,
// big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
// pbs_level, pbs_base_log, grouping_factor, lwe_ciphertext_count,
// message_modulus, carry_modulus);
// break;
case GT:
case GE:
case LT:
case LE:
host_integer_radix_scalar_difference_check_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count, num_scalar_blocks);
break;
case MAX:
case MIN:
host_integer_radix_scalar_maxmin_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
break;
default:
printf("Not implemented\n");
}
}

View File

@@ -0,0 +1,298 @@
#ifndef CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
#include "integer/comparison.cuh"
#include <omp.h>
template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto diff_buffer = mem_ptr->diff_buffer;
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
// Reducing the sign blocks is the bottleneck of the comparison algorithms;
// however, in the scalar case there is an improvement:
//
// The idea is to reduce the number of sign blocks we have to reduce.
// We can do that by splitting the comparison problem in two parts:
//
// - One part where we compute the sign blocks between the scalar and just
// enough blocks from the ciphertext to represent the scalar value
//
// - The other part is to compare the ciphertext blocks not considered for the
// sign computation with zero, and create a single sign block from that.
//
// The smaller the scalar value is compared to the number of encrypted bits in
// the ciphertext, the more comparisons with zero we have to do, and the fewer
// sign blocks we will have to reduce.
//
// This creates a speedup, as comparing a bunch of blocks with 0 is faster.
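// Example added for exposition (not in the original comment): comparing 32
// radix blocks of 2-bit messages against a scalar that fits in 8 bits means
// only the 4 least significant blocks go through the block-wise scalar
// comparison; the 28 most significant blocks are only checked against zero,
// and the two resulting sign blocks are merged by a single bivariate lookup
// at the end.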
if (total_num_scalar_blocks == 0) {
// We only have to compare blocks with zero
// means scalar is zero
host_compare_with_zero_equality(stream, mem_ptr->tmp_lwe_array_out,
lwe_array_in, mem_ptr, bsk, ksk,
total_num_radix_blocks);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
return sign_handler_f(x);
};
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, scalar_last_leaf_lut_f);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
// The result will be in the first block. Everything else is
// garbage.
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (total_num_radix_blocks - 1),
stream);
} else if (total_num_scalar_blocks < total_num_radix_blocks) {
// We have to handle both part of the work described above
uint32_t num_lsb_radix_blocks = total_num_scalar_blocks;
uint32_t num_msb_radix_blocks =
total_num_radix_blocks - num_lsb_radix_blocks;
auto lsb = lwe_array_in;
auto msb = lwe_array_in + num_lsb_radix_blocks * big_lwe_size;
auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
cuda_synchronize_stream(stream);
auto lsb_stream = diff_buffer->lsb_stream;
auto msb_stream = diff_buffer->msb_stream;
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
//////////////
// lsb
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->cleaning_lut_f, bsk, ksk,
num_lsb_radix_blocks);
}
#pragma omp section
{
//////////////
// msb
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
mem_ptr, bsk, ksk,
num_msb_radix_blocks);
}
}
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);
//////////////
// Reduce the two blocks into one final
auto scalar_bivariate_last_leaf_lut_f =
[sign_handler_f](Torus lsb, Torus msb) -> Torus {
if (msb == 1)
return sign_handler_f(lsb);
else
return sign_handler_f(IS_SUPERIOR);
};
auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate<Torus>(
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, scalar_bivariate_last_leaf_lut_f);
integer_radix_apply_bivariate_lookup_table_kb(
stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
1, lut);
// The result will be in the first block. Everything else is garbage.
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
(total_num_radix_blocks - 1) * big_lwe_size_bytes,
stream);
} else {
// We only have to do the regular comparison
// And not the part where we compare most significant blocks with zeros
// total_num_radix_blocks == total_num_scalar_blocks
uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
uint32_t num_scalar_blocks = total_num_scalar_blocks;
auto lsb = lwe_array_in;
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
num_scalar_blocks /= 2;
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
ksk, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(stream, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
ksk, num_lsb_radix_blocks);
// The result will be in the first block. Everything else is garbage.
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
(total_num_radix_blocks - 1) * big_lwe_size_bytes,
stream);
}
}
template <typename Torus>
__host__ void
scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
// be set to 1
// meaning that the output of the pbs will be the negative (modulo message
// space)
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow the bit of padding is 1 and not 0.
// When applying the LUT for an input value of 14 we would expect 1,
// but since the bit of padding is 1, we will get -1 modulo our message
// space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000.
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(
subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace(
stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
subtracted_blocks, bsk, ksk,
num_radix_blocks, sign_lut);
// Add one
// Here lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, or 1, so the output values after the addition will be 0, 1 or 2.
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
big_lwe_dimension, num_radix_blocks,
message_modulus, carry_modulus);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb(
stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
// There is no optimized CMUX for scalars, so we convert to a trivial
// ciphertext
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(stream, lwe_array_right, scalar_blocks,
params.big_lwe_dimension, total_num_radix_blocks,
total_num_scalar_blocks, params.message_modulus,
params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
#endif

View File

@@ -0,0 +1,40 @@
#include "scalar_rotate.cuh"
void scratch_cuda_integer_radix_scalar_rotate_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
void *lwe_array, uint32_t n,
int8_t *mem_ptr, void *bsk,
void *ksk,
uint32_t num_blocks) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array), n,
(int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
num_blocks);
}
void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_shift_buffer<uint64_t> *mem_ptr =
(int_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -0,0 +1,114 @@
#ifndef CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
#define CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
bool allocate_gpu_memory) {
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
size_t num_bits_in_message = (size_t)log2(message_modulus);
size_t total_num_bits = num_bits_in_message * num_blocks;
n = n % total_num_bits;
if (n == 0) {
return;
}
size_t rotations = n / num_bits_in_message;
size_t shift_within_block = n % num_bits_in_message;
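// Example added for exposition: with 2-bit messages and 4 blocks
// (total_num_bits = 8), a rotation by n = 5 decomposes into rotations = 2
// whole-block rotations plus shift_within_block = 1 bit handled by the
// bivariate lookup tables below.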
Torus *rotated_buffer = mem->tmp_rotated;
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
// rotate the blocks of the radix ciphertext by whole blocks first
// and copy the result into a new buffer
// 256 threads are used in every block
// num_blocks blocks will be used in the grid
// one block is responsible for processing a single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0) {
return;
}
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
lut_bivariate);
} else {
// right rotation: the blocks are rotated left since they are stored from LSB
// to MSB
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0) {
return;
}
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
lut_bivariate);
}
}
#endif // CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH

View File

@@ -0,0 +1,38 @@
#include "scalar_shifts.cuh"
void scratch_cuda_integer_radix_scalar_shift_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_radix_scalar_shift_kb<uint64_t>(
stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
void cuda_integer_radix_scalar_shift_kb_64_inplace(
cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_blocks) {
host_integer_radix_scalar_shift_kb_inplace<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array), shift,
(int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
num_blocks);
}
void cleanup_cuda_integer_radix_scalar_shift(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_shift_buffer<uint64_t> *mem_ptr =
(int_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -0,0 +1,125 @@
#ifndef CUDA_INTEGER_SHIFT_OPS_CUH
#define CUDA_INTEGER_SHIFT_OPS_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_shift_kb(
cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
bool allocate_gpu_memory) {
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_shift_kb_inplace(
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
size_t num_bits_in_block = (size_t)log2(message_modulus);
size_t total_num_bits = num_bits_in_block * num_blocks;
shift = shift % total_num_bits;
if (shift == 0) {
return;
}
size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
size_t shift_within_block = shift % num_bits_in_block;
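// Example added for exposition: with 2-bit messages and 4 blocks, shift = 5
// decomposes into rotations = 2 whole blocks (which are zeroed below) and
// shift_within_block = 1 bit that is propagated between neighbouring blocks
// through the lookup tables.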
Torus *rotated_buffer = mem->tmp_rotated;
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
auto lut_univariate = mem->lut_buffers_univariate[shift_within_block];
// rotate the blocks of the radix ciphertext by whole blocks first
// and copy the result into a new buffer
// 256 threads are used in every block
// num_blocks blocks will be used in the grid
// one block is responsible for processing a single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
stream);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0 || rotations == num_blocks) {
return;
}
// check if we have enough blocks for partial processing
if (rotations < num_blocks - 1) {
auto partial_current_blocks = &lwe_array[(rotations + 1) * big_lwe_size];
auto partial_previous_blocks = &lwe_array[rotations * big_lwe_size];
size_t partial_block_count = num_blocks - rotations - 1;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, partial_current_blocks, partial_current_blocks,
partial_previous_blocks, bsk, ksk, partial_block_count,
lut_bivariate);
}
auto rest = &lwe_array[rotations * big_lwe_size];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, rest, rest, bsk, ksk, 1, lut_univariate);
} else {
// right shift
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, stream);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0 || rotations == num_blocks) {
return;
}
// check if we have enough blocks for partial processing
if (rotations < num_blocks - 1) {
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &lwe_array[big_lwe_size];
size_t partial_block_count = num_blocks - rotations - 1;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
// The right-most block is done separately as it does not
// need to recover the shifted bits from its right neighbour.
auto last_block = &lwe_array[(num_blocks - rotations - 1) * big_lwe_size];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, last_block, last_block, bsk, ksk, 1, lut_univariate);
}
}
#endif // CUDA_INTEGER_SHIFT_OPS_CUH

View File

@@ -0,0 +1,109 @@
#include "linearalgebra/addition.cuh"
/*
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of two u64 input LWE ciphertext vectors.
* - `stream` is the Cuda stream (carrying the index of the GPU) to be used in
* the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector 1 is
* added to the corresponding element in the input LWE ciphertext 2. The result
* is stored in the output LWE ciphertext vector. The two input LWE ciphertext
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
* plaintext vector.
* - `stream` is the Cuda stream (carrying the index of the GPU) to be used in
* the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_array_in` is the plaintext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of plaintexts in the input plaintext vector.
*
* Each plaintext of the input plaintext vector is added to the body of the
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
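/*
 * Illustrative usage sketch (added for exposition, not part of the original
 * file). It only shows the size bookkeeping expected by the wrappers above:
 * each ciphertext stores `lwe_dimension` mask elements plus one body, so the
 * three device buffers must each hold (lwe_dimension + 1) * count 64-bit
 * words. The stream and the device pointers are assumed to have been created
 * with the backend's usual allocation routines beforehand; the helper name is
 * hypothetical.
 */
static inline void example_add_radix_ciphertexts(
    cuda_stream_t *stream, uint64_t *d_out, uint64_t *d_lhs, uint64_t *d_rhs,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_radix_blocks) {
  // Radix blocks live in the "big" LWE dimension
  // glwe_dimension * polynomial_size after a programmable bootstrap
  uint32_t big_lwe_dimension = glwe_dimension * polynomial_size;
  cuda_add_lwe_ciphertext_vector_64(stream, d_out, d_lhs, d_rhs,
                                    big_lwe_dimension, num_radix_blocks);
}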

View File

@@ -0,0 +1,154 @@
#ifndef CUDA_ADD_CUH
#define CUDA_ADD_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"
#include <stdio.h>
template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
if (plaintext_index < num_entries) {
int index =
plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] + plaintext_input[plaintext_index];
}
}
template <typename T>
__host__ void host_addition_plaintext(cuda_stream_t *stream, T *output,
T *lwe_input, T *plaintext_input,
uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cuda_memcpy_async_gpu_to_gpu(
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
stream);
plaintext_addition<<<grid, thds, 0, stream->stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = input_1[index] + input_2[index];
}
}
// Coefficient-wise addition
template <typename T>
__host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
T *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
addition<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void subtraction(T *output, T *input_1, T *input_2,
uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = input_1[index] - input_2[index];
}
}
// Coefficient-wise subtraction
template <typename T>
__host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
T *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
subtraction<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
if (plaintext_index < num_entries) {
int index =
plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
// Here we take advantage of the wrapping behaviour of uint
lwe_ct[index] -= plaintext_input[plaintext_index];
}
}
template <typename T>
__host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
input_lwe_ciphertext_count *
(input_lwe_dimension + 1) * sizeof(T),
stream);
radix_body_subtraction_inplace<<<grid, thds, 0, stream->stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_ADD_CUH

View File

@@ -0,0 +1,56 @@
#include "linearalgebra/multiplication.cuh"
/*
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
* input cleartext vector.
* - `stream` is the Cuda stream (carrying the index of the GPU) to be used in
* the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `cleartext_array_in` is the cleartext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of cleartexts in the input cleartext vector.
*
* Each cleartext of the input cleartext vector is multiplied with the mask and
* body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
* result of the operation is stored in the output LWE ciphertext vector. The
* two input vectors are unchanged. This function is a wrapper to a device
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -0,0 +1,52 @@
#ifndef CUDA_MULT_CUH
#define CUDA_MULT_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"
#include <fstream>
#include <iostream>
#include <vector>
template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
int cleartext_index = index / (input_lwe_dimension + 1);
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] * cleartext_input[cleartext_index];
}
}
template <typename T>
__host__ void
host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
T *cleartext_input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_multiplication<<<grid, thds, 0, stream->stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_MULT_CUH

View File

@@ -0,0 +1,49 @@
#include "linearalgebra/negation.cuh"
/*
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the negation of a u64 input LWE ciphertext vector.
* - `stream` is the Cuda stream (carrying the index of the GPU) to be used in
* the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector is
* negated. The result is stored in the output LWE ciphertext vector. The input
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -0,0 +1,44 @@
#ifndef CUDA_NEGATE_CUH
#define CUDA_NEGATE_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"
template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = -input[index];
}
}
template <typename T>
__host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
negation<<<grid, thds, 0, stream->stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_NEGATE_CUH

View File

@@ -0,0 +1 @@
#include "bootstrapping_key.cuh"

View File

@@ -0,0 +1,377 @@
#include "bootstrap_amortized.cuh"
/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
return get_buffer_size_bootstrap_amortized<uint64_t>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
}
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_bootstrap_amortized(int polynomial_size) {
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
assert(("Error (GPU amortized PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_amortized(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_amortized_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<1024>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_amortized_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
* ciphertexts. See the corresponding operation on 64 bits for more details.
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_amortized(32, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
host_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
}
}
/* Perform the programmable bootstrapping on a batch of input u64 LWE
 * ciphertexts. This function performs best for large numbers of inputs (> 10).
 * - `stream` is a pointer to the cuda_stream_t to be used in the kernel
 * launch; it bundles the Cuda stream and the index of the GPU on which the
 * kernel is launched
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector of lut_vector to use for each LWE input in
* lwe_array_in
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
 * - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
* the input LWE array is copied to each GPU, but the whole LUT array is copied
* (because the case when the number of LUTs is smaller than the number of input
* LWEs is not trivial to take into account in the data repartition on the
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
* input in the LUT array `lut_vector`.
 * - max_shared_memory: maximum amount of shared memory to be used inside
* device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into level_count levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_amortized(64, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
host_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
}
}
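/*
 * Minimal sketch (illustrative, not called elsewhere in this file): the
 * bootstrapping key described in the documentation above holds
 * (k + 1)^2 * l * N * n Torus coefficients; in the Fourier domain each
 * polynomial of N coefficients is stored as N / 2 double2 values. The sample
 * call in the trailing comment uses assumed parameters (k = 1, l = 4,
 * N = 1024, n = 600) chosen only to match the orders of magnitude quoted in
 * that documentation.
 */
static inline uint64_t example_bsk_element_count(uint32_t glwe_dimension,
                                                 uint32_t level_count,
                                                 uint32_t polynomial_size,
                                                 uint32_t lwe_dimension) {
  return (uint64_t)(glwe_dimension + 1) * (glwe_dimension + 1) * level_count *
         polynomial_size * lwe_dimension;
}
// e.g. example_bsk_element_count(1, 4, 1024, 600) == 9830400 coefficients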
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer) {
// Free memory
cuda_drop_async(*pbs_buffer, stream);
}
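/*
 * Minimal usage sketch of the amortized PBS entry points defined above. It
 * assumes the caller already created a cuda_stream_t and copied the LWE
 * inputs, LUTs, index arrays and the Fourier-domain bootstrapping key to the
 * GPU; the d_* parameters are placeholders for those device buffers, and one
 * LUT per input is assumed (num_lut_vectors == num_samples, lwe_idx == 0).
 */
void example_run_amortized_pbs_64(
    cuda_stream_t *stream, void *d_lwe_out, void *d_out_indexes,
    void *d_lut_vector, void *d_lut_indexes, void *d_lwe_in,
    void *d_in_indexes, void *d_bsk, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;
  // Allocate the temporary buffer and configure the shared-memory mode
  scratch_cuda_bootstrap_amortized_64(stream, &pbs_buffer, glwe_dimension,
                                      polynomial_size, num_samples,
                                      max_shared_memory, true);
  // Bootstrap the whole batch on the given stream
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_out_indexes, d_lut_vector, d_lut_indexes, d_lwe_in,
      d_in_indexes, d_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, base_log, level_count, num_samples, num_samples, 0,
      max_shared_memory);
  // Release the temporary buffer (stream-ordered)
  cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
}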


@@ -0,0 +1,363 @@
#ifndef CUDA_AMORTIZED_PBS_CUH
#define CUDA_AMORTIZED_PBS_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by host_bootstrap_amortized
*
* Uses shared memory to increase performance
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which test vector
* to use for each sample in lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
* key sk2
* - device_mem: pointer to the device's global memory in case we use it (SMD
* == NOSM or PARTIALSM)
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - gpu_num: index of the current GPU (useful for multi-GPU computations)
* - lwe_idx: equal to the number of samples per gpu x gpu_num
* - device_memory_size_per_sample: amount of global memory to allocate if SMD
* is not FULLSM
*/
__global__ void device_bootstrap_amortized(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
// For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only
// one mask polynomial and 1 body to handle.
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
double2 *res_fft =
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
(sizeof(double2) / sizeof(Torus));
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft = (double2 *)res_fft +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
auto block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
(glwe_dimension + 1)];
// Put "b", the body, in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
// into level_count polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_dimension; iteration++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
    // Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
synchronize_threads_in_block();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int i = 0; i < (glwe_dimension + 1); i++)
for (int j = 0; j < params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
for (int level = level_count - 1; level >= 0; level--) {
for (int i = 0; i < (glwe_dimension + 1); i++) {
gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
level, polynomial_size,
glwe_dimension, level_count);
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
for (int j = 0; j < (glwe_dimension + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, accumulator_fft, bsk_poly);
}
}
synchronize_threads_in_block();
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
}
synchronize_threads_in_block();
} else {
#pragma unroll
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
int tid = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
accumulator_fft[tid] = res_fft_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
}
synchronize_threads_in_block();
}
}
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
uint32_t polynomial_size, uint32_t glwe_dimension) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // res fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count;
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count;
}
return device_mem + device_mem % sizeof(double2);
}
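/*
 * Sketch of the shared-memory mode selection implied by the thresholds above
 * and used by scratch_bootstrap_amortized / host_bootstrap_amortized below:
 * no shared memory when even the FFT accumulator does not fit, partial use
 * when only the FFT accumulator fits, full use otherwise. This helper is
 * illustrative and not called by the rest of the file.
 */
template <typename Torus>
__host__ sharedMemDegree example_select_amortized_sm_mode(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t max_shared_memory) {
  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
  if (max_shared_memory < partial_sm)
    return NOSM;
  if (max_shared_memory < full_sm)
    return PARTIALSM;
  return FULLSM;
}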
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_amortized(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
check_cuda_error(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t SM_PART =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
uint64_t DM_PART = SM_FULL - SM_PART;
uint64_t DM_FULL = SM_FULL;
// Create a 1-dimensional grid of threads
// where each block handles 1 sample and each thread
// handles opt polynomial coefficients
// (actually opt/2 coefficients since we compress the real polynomial into a
// complex)
dim3 grid(input_lwe_ciphertext_count, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
// Launch the kernel using polynomial_size/opt threads
// where each thread computes opt polynomial coefficients
// Depending on the required amount of shared memory, choose
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
device_bootstrap_amortized<Torus, params, NOSM>
<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
// device then has to be allocated dynamically.
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
}
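/*
 * Worked launch-shape example for the wrapper above (illustrative values):
 * assuming params::opt == 4 (opt is defined by the AmortizedDegree parameter
 * class, so this value is an assumption) and polynomial_size == 2048, each of
 * the input_lwe_ciphertext_count blocks runs 2048 / 4 = 512 threads, and each
 * thread owns 4 torus coefficients, i.e. 2 double2 values once the real
 * polynomial is compressed into a complex one.
 */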
template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {
  int blocks_per_sm = 0;
  int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  cudaGetDeviceProperties(&device_properties, 0);
  // Occupancy is estimated for the full shared-memory variant of the kernel,
  // with no dynamic shared memory requested
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);
  return device_properties.multiProcessorCount * blocks_per_sm;
}
#endif // CUDA_AMORTIZED_PBS_CUH


@@ -0,0 +1,453 @@
#ifndef CUDA_FAST_LOWLAT_PBS_CUH
#define CUDA_FAST_LOWLAT_PBS_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "cooperative_groups.h"
#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
// Cooperative groups are used in the low latency PBS
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration,
grid_group &grid) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Selects all GLWEs in a particular decomposition level
auto level_join_buffer =
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
/*
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
*
* - lwe_array_out: vector of output lwe s, with length
* (glwe_dimension * polynomial_size+1)*num_samples
* - lut_vector: vector of look up tables with
* length (glwe_dimension+1) * polynomial_size * num_samples
* - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of lwe inputs with length (lwe_dimension + 1) *
* num_samples
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_fast_low_latency(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
  // Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[i], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
    // Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
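/*
 * Sketch of the launch geometry this kernel expects; it matches the grid set
 * up by host_bootstrap_fast_low_latency below: one x-block per decomposition
 * level, one y-block per GLWE polynomial (the mask polynomials plus the body,
 * hence glwe_dimension + 1) and one z-block per input ciphertext.
 */
__host__ inline dim3 example_fast_low_latency_grid(uint32_t glwe_dimension,
                                                   uint32_t level_count,
                                                   uint32_t num_samples) {
  return dim3(level_count, glwe_dimension + 1, num_samples);
}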
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
return buffer_size + buffer_size % sizeof(double2);
}
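/*
 * Illustrative helper: size in bytes of the cooperative join buffer appended
 * at the end of the buffer computed above. Worked example with assumed
 * parameters glwe_dimension = 1, level_count = 2, polynomial_size = 1024 and
 * a single input: (1 + 1) * 2 * 1 * 512 = 2048 double2 values, i.e. 32 KiB,
 * on top of the per-block scratch selected by the shared-memory thresholds.
 */
__host__ __device__ inline uint64_t example_fast_low_latency_join_buffer_bytes(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count) {
  return (uint64_t)(glwe_dimension + 1) * level_count *
         input_lwe_ciphertext_count * (polynomial_size / 2) * sizeof(double2);
}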
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_fast_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
}
/*
 * Host wrapper to the fast (cooperative groups based) low latency version
 * of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_fast_low_latency(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t full_dm = full_sm;
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = pbs_buffer;
double2 *buffer_fft =
(double2 *)d_mem +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory) /
sizeof(double2) -
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * polynomial_size / 2);
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
void *kernel_args[14];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
kernel_args[3] = &lut_vector_indexes;
kernel_args[4] = &lwe_array_in;
kernel_args[5] = &lwe_input_indexes;
kernel_args[6] = &bootstrapping_key;
kernel_args[7] = &buffer_fft;
kernel_args[8] = &lwe_dimension;
kernel_args[9] = &polynomial_size;
kernel_args[10] = &base_log;
kernel_args[11] = &level_count;
kernel_args[12] = &d_mem;
if (max_shared_memory < partial_sm) {
kernel_args[13] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, grid,
thds, (void **)kernel_args, 0, stream->stream));
} else if (max_shared_memory < full_sm) {
kernel_args[13] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream->stream));
} else {
int no_dm = 0;
kernel_args[13] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, grid,
thds, (void **)kernel_args, full_sm, stream->stream));
}
check_cuda_error(cudaGetLastError());
}
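/*
 * Minimal sketch (an assumption, not necessarily the backend's own helper) of
 * how a caller could obtain the max_shared_memory argument used throughout
 * this file with the plain CUDA runtime API.
 */
__host__ inline int example_query_max_shared_memory(int gpu_index) {
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu_index);
  return max_shared_memory;
}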
// Verify if the grid size for the low latency kernel satisfies the cooperative
// group constraints
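// This check is needed because cudaLaunchCooperativeKernel only guarantees a
// valid grid.sync() when every block of the grid can be resident on the
// device at once; when that is not the case, callers fall back to the regular
// (non-cooperative) low latency implementation.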
template <typename Torus, class params>
__host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(params::degree);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, thds,
0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
thds, 0);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
0);
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
#endif // CUDA_FAST_LOWLAT_PBS_CUH


@@ -0,0 +1,321 @@
#ifndef CUDA_FAST_MULTIBIT_PBS_CUH
#define CUDA_FAST_MULTIBIT_PBS_CUH
#include "bootstrap.h"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_fast_accumulate(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
block_join_buffer, keybundle, polynomial_size,
glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_fast_multibit_bootstrap(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t lwe_chunk_size,
uint32_t max_shared_memory) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // join buffer
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
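/*
 * Sketch: the dominant term above is the keybundle FFT. Per input ciphertext
 * it holds lwe_chunk_size * level_count * (glwe_dimension + 1)^2 GGSW
 * polynomials of polynomial_size / 2 double2 values each, i.e. the same
 * quantity host_fast_multi_bit_pbs recomputes as keybundle_size_per_input.
 * This helper is illustrative only.
 */
__host__ __device__ inline uint64_t example_keybundle_size_per_input(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t lwe_chunk_size) {
  return (uint64_t)lwe_chunk_size * level_count * (glwe_dimension + 1) *
         (glwe_dimension + 1) * (polynomial_size / 2);
}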
template <typename Torus, typename STorus, typename params>
__host__ void scratch_fast_multi_bit_pbs(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension);
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *buffer_fft = (double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
Torus *global_accumulator =
(Torus *)buffer_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
//
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
//
void *kernel_args[18];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
kernel_args[3] = &lut_vector_indexes;
kernel_args[4] = &lwe_array_in;
kernel_args[5] = &lwe_input_indexes;
kernel_args[6] = &keybundle_fft;
kernel_args[7] = &buffer_fft;
kernel_args[8] = &global_accumulator;
kernel_args[9] = &lwe_dimension;
kernel_args[10] = &glwe_dimension;
kernel_args[11] = &polynomial_size;
kernel_args[12] = &base_log;
kernel_args[13] = &level_count;
kernel_args[14] = &grouping_factor;
kernel_args[17] = &keybundle_size_per_input;
//
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1),
level_count);
device_multi_bit_bootstrap_keybundle<Torus, params>
<<<grid_keybundle, thds, full_sm_keybundle, stream->stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, lwe_offset, chunk_size,
keybundle_size_per_input);
check_cuda_error(cudaGetLastError());
kernel_args[15] = &lwe_offset;
kernel_args[16] = &chunk_size;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
grid_accumulate, thds, (void **)kernel_args, full_sm_accumulate,
stream->stream));
}
}
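/*
 * Worked example of the chunking in the loop above (illustrative numbers):
 * with lwe_dimension / grouping_factor == 10 group indices and
 * lwe_chunk_size == 4, the loop runs three times with chunk sizes 4, 4 and 2;
 * only the last iteration satisfies lwe_offset + chunk >= 10, so the sample
 * extract inside the accumulate kernel runs exactly once per input.
 */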
// Verify if the grid size for the fast multi-bit kernel satisfies the
// cooperative group constraints
template <typename Torus, class params>
__host__ bool
verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>, thds,
full_sm);
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
#endif // CUDA_FAST_MULTIBIT_PBS_CUH


@@ -0,0 +1,845 @@
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_low_latency.cuh"
/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
default:
return 0;
break;
}
}
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
int polynomial_size, int num_samples) {
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_bootstrap_low_latency(int nbits, int glwe_dimension,
int level_count, int base_log,
int polynomial_size, int num_samples) {
assert(("Error (GPU low latency PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
polynomial_size, num_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_low_latency_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(
glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<1024>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<1024>>(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low_latency PBS on 64 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_low_latency_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(
glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<1024>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<1024>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs. For more details on this
 * operation, refer to the equivalent u64 operation.
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
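  // For each polynomial size, dispatch to the "fast" low latency kernel when
  // the grid-size check passes on this device, and fall back to the two-step
  // host_bootstrap_low_latency implementation otherwise.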
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<256>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<256>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<512>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<512>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<1024>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<2048>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<4096>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<8192>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<16384>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<16384>>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
default:
break;
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs.
*
 * - `stream` is the Cuda stream to be used in the kernel launches; it also
 * carries the index of the GPU on which the kernels are launched
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
 * - lwe_idx: the index of the LWE input to consider for the GPU associated
 * with `stream`. In case of multi-GPU computing, it is assumed that only a part of
* the input LWE array is copied to each GPU, but the whole LUT array is copied
* (because the case when the number of LUTs is smaller than the number of input
* LWEs is not trivial to take into account in the data repartition on the
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
* input in the LUT array `lut_vector`.
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 2048:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 4096:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 8192:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
    break;
  default:
break;
}
}
/*
* This cleanup function frees the data for the low latency PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer) {
// Free memory
cuda_drop_async(*pbs_buffer, stream);
}
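
/*
 * Minimal usage sketch (illustration only, not part of the backend API): the
 * expected call sequence is scratch -> bootstrap -> cleanup. All `d_`-prefixed
 * pointers are hypothetical placeholders assumed to already live on the GPU
 * (with the bootstrapping key already converted to the Fourier domain), and
 * `stream` is assumed to be a valid cuda_stream_t created through the
 * library's device helpers.
 */
static void example_low_latency_pbs_u64(
    cuda_stream_t *stream, void *d_lwe_out, void *d_lwe_out_indexes,
    void *d_lut, void *d_lut_indexes, void *d_lwe_in, void *d_lwe_in_indexes,
    void *d_fourier_bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;
  // Allocate the temporary buffer (global accumulators + device scratch)
  scratch_cuda_bootstrap_low_latency_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, level_count,
      num_samples, max_shared_memory, true);
  // Bootstrap the whole batch with a single LUT (num_lut_vectors = 1,
  // lwe_idx = 0)
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_lwe_out_indexes, d_lut, d_lut_indexes, d_lwe_in,
      d_lwe_in_indexes, d_fourier_bsk, pbs_buffer, lwe_dimension,
      glwe_dimension, polynomial_size, base_log, level_count, num_samples, 1,
      0, max_shared_memory);
  // Release the temporary buffer on the same stream
  cleanup_cuda_bootstrap_low_latency(stream, &pbs_buffer);
}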


@@ -0,0 +1,487 @@
#ifndef CUDA_LOWLAT_PBS_CUH
#define CUDA_LOWLAT_PBS_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_one(
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
  // Perform ACC * (X^{a_hat} - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_slice[tid] = accumulator_fft[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_two(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * params::degree / 2 / sizeof(Torus));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice = global_accumulator_fft +
(level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(bootstrapping_key, lwe_iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
add_to_torus<Torus, params>(accumulator_fft, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
      // Perform a sample extract: blocks with blockIdx.y < glwe_dimension
      // extract the mask coefficients of the output LWE, while the block with
      // blockIdx.y == glwe_dimension extracts its body
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
uint64_t buffer_size = device_mem +
// global_accumulator_fft
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2) +
// global_accumulator
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(Torus);
return buffer_size + buffer_size % sizeof(double2);
}
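/*
 * Worked example (hypothetical parameters): for Torus = uint64_t,
 * polynomial_size = 1024, glwe_dimension = 1, level_count = 1 and a single
 * input ciphertext, on a device whose max_shared_memory covers
 * full_sm_step_one (16384 bytes here), device_mem stays 0 and the buffer
 * reduces to
 *   global_accumulator_fft: 2 * 1 * 1 * 512 * sizeof(double2) = 16384 bytes
 *   global_accumulator:     2 * 1 * 1024 * sizeof(uint64_t)   = 16384 bytes
 * i.e. 32768 bytes in total (already a multiple of sizeof(double2)).
 */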
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, class params>
__host__ void execute_low_latency_step_one(
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
if (max_shared_memory < partial_sm) {
device_bootstrap_low_latency_step_one<Torus, params, NOSM>
<<<grid, thds, 0, stream->stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream->stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream->stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
__host__ void execute_low_latency_step_two(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
if (max_shared_memory < partial_sm) {
device_bootstrap_low_latency_step_two<Torus, params, NOSM>
<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
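  // The single pbs_buffer allocated by the scratch function is carved into
  // three regions: the Fourier-domain accumulators (global_accumulator_fft),
  // the standard-domain accumulators persisted between iterations
  // (global_accumulator) and, when shared memory is insufficient, the
  // per-block device scratch (d_mem).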
double2 *global_accumulator_fft = (double2 *)pbs_buffer;
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * (polynomial_size / 2) /
sizeof(Torus));
int8_t *d_mem = (int8_t *)global_accumulator +
(ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
input_lwe_ciphertext_count * polynomial_size /
sizeof(int8_t));
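  // The blind rotation is iterated over the LWE mask: for each mask element,
  // step one rotates the accumulator, decomposes it and moves it to the
  // Fourier domain (one block per decomposition level), and step two performs
  // the product with the bootstrapping key in the Fourier domain before coming
  // back to the torus; the last iteration ends with the sample extract.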
for (int i = 0; i < lwe_dimension; i++) {
execute_low_latency_step_one<Torus, params>(
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_low_latency_step_two<Torus, params>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two,
full_dm_step_two);
}
}
#endif // CUDA_LOWLAT_PBS_CUH


@@ -0,0 +1,485 @@
#include "../polynomial/parameters.cuh"
#include "bootstrap_fast_multibit.cuh"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
void checks_multi_bit_pbs(int polynomial_size) {
assert(
("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
checks_multi_bit_pbs(polynomial_size);
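  // Same dispatch pattern as the low latency PBS: use the "fast" multi-bit
  // kernel when the grid-size check passes on this device, and the regular
  // multi-bit kernel otherwise.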
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
default:
break;
}
}
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
default:
break;
}
}
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer) {
// Free memory
cuda_drop_async(*pbs_buffer, stream);
}
// Pick the best possible chunk size for each GPU
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0
const char *v100Name = "V100"; // Known name of V100 GPU
const char *a100Name = "A100"; // Known name of A100 GPU
const char *h100Name = "H100"; // Known name of H100 GPU
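  // The per-architecture chunk sizes below are presumably empirical tuning
  // values: they trade keybundle memory footprint against occupancy for each
  // batch size, and appear to have been measured rather than derived.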
if (std::strstr(deviceProp.name, v100Name) != nullptr) {
// Tesla V100
if (num_samples == 1)
return 60;
else if (num_samples == 2)
return 40;
else if (num_samples <= 4)
return 20;
else if (num_samples <= 8)
return 10;
else if (num_samples <= 16)
return 40;
else if (num_samples <= 32)
return 27;
else if (num_samples <= 64)
return 20;
else if (num_samples <= 128)
return 18;
else if (num_samples <= 256)
return 16;
else if (num_samples <= 512)
return 15;
else if (num_samples <= 1024)
return 15;
else
return 12;
} else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
// Tesla A100
if (num_samples < 4)
return 11;
else if (num_samples < 8)
return 6;
else if (num_samples < 16)
return 13;
else if (num_samples < 64)
return 19;
else if (num_samples < 128)
return 1;
else if (num_samples < 512)
return 19;
else if (num_samples < 1024)
return 17;
else if (num_samples < 8192)
return 19;
else if (num_samples < 16384)
return 12;
else
return 9;
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
// Tesla H100
return 45;
}
// Generic case
return 1;
}
// Returns a chunk size that is not optimal, but close to the optimal one
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0
const char *v100Name = "V100"; // Known name of V100 GPU
const char *a100Name = "A100"; // Known name of A100 GPU
const char *h100Name = "H100"; // Known name of H100 GPU
if (std::strstr(deviceProp.name, v100Name) != nullptr) {
// Tesla V100
return 18;
} else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
// Tesla A100
return 45;
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
// Tesla H100
return 45;
}
// Generic case
return 10;
}
// Returns the maximum buffer size required to execute batches up to
// max_input_lwe_ciphertext_count
// todo: Deprecate this function
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count) {
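  // The chunk size is not monotonic in the batch size, so the buffer size is
  // evaluated for every power-of-two batch up to the maximum and the largest
  // result is kept.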
uint64_t max_buffer_size = 0;
for (uint32_t input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
input_lwe_ciphertext_count *= 2) {
max_buffer_size = std::max(
max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count)));
}
return max_buffer_size;
}


@@ -0,0 +1,476 @@
#ifndef CUDA_MULTIBIT_PBS_CUH
#define CUDA_MULTIBIT_PBS_CUH
#include "bootstrap.h"
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
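  // The monomial degree is the sum of the LWE mask elements of this group
  // selected by the bits of ggsw_idx, rescaled to [0, 2 * params::degree).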
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
int selection_bit = (ggsw_idx >> mask_position) & 1;
x += selection_bit * lwe_array_group[i];
}
  return rescale_torus_element(
      x, 2 * params::degree); // rescale x into [0, 2 * params::degree)
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_keybundle(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory = sharedmem;
// Ids
uint32_t level_id = blockIdx.z;
uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1);
uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
//
Torus *accumulator = (Torus *)selected_memory;
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
// select the input
input_idx * keybundle_size_per_input;
////////////////////////////////////////////////////////////
// Computes all keybundles
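    // A keybundle entry accumulates the 2^grouping_factor bootstrapping-key
    // polynomials of one group, each shifted by a monomial derived from the
    // corresponding LWE mask elements, and stores the result in the Fourier
    // domain.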
uint32_t rev_lwe_iteration =
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
// ////////////////////////////////
// Keygen guarantees the first term is a constant term of the polynomial, no
// polynomial multiplication required
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
bsk_poly, accumulator);
// Accumulate the other terms
for (int g = 1; g < (1 << grouping_factor); g++) {
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
// Calculates the monomial degree
Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
lwe_array_group, g, grouping_factor);
synchronize_threads_in_block();
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial<Torus, params>(
accumulator, bsk_poly, monomial_degree, false);
}
synchronize_threads_in_block();
double2 *fft = (double2 *)sharedmem;
// Move accumulator to local memory
double2 temp[params::opt / 2];
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
temp[i].y =
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
temp[i].x /= (double)std::numeric_limits<Torus>::max();
temp[i].y /= (double)std::numeric_limits<Torus>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Move from local memory back to shared memory but as complex
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = temp[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(fft);
// lwe iteration
auto keybundle_out = get_ith_mask_kth_block(
keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
polynomial_size, glwe_dimension, level_count);
auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
fft, keybundle_poly);
}
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_one(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lwe_iteration) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
////////////////////////////////////////////////////////////
// Initializes the accumulator with the body of LWE
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
  // don't modify the same memory space at the same time
  synchronize_threads_in_block();
  // Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
accumulator_fft, global_fft_slice);
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_two(
Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
uint32_t lwe_offset, uint32_t lwe_chunk_size) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
double2 *accumulator_fft = (double2 *)selected_memory;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.x * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
double2 *global_accumulator_fft_input =
global_accumulator_fft +
blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice =
global_accumulator_fft_input +
level * (glwe_dimension + 1) * (polynomial_size / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(keybundle, iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
synchronize_threads_in_block();
uint32_t lwe_iteration = iteration + lwe_offset;
if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
      // Perform a sample extract: blocks with blockIdx.y < glwe_dimension each
      // extract one mask polynomial, while the last block extracts the body
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
}
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_keybundle(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_one(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_two(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // global_accumulator_fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
polynomial_size);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_two));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension);
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_multi_bit_pbs(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
// If a chunk size is not passed to this function, select one.
if (!lwe_chunk_size)
lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension);
//
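  // pbs_buffer layout: [keybundle fft | global accumulator fft | global
  // accumulator], matching the sizes summed in
  // get_buffer_size_multibit_bootstrap().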
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *global_accumulator_fft =
(double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
//
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
//
dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1),
level_count);
device_multi_bit_bootstrap_keybundle<Torus, params>
<<<grid_keybundle, thds, full_sm_keybundle, stream->stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, lwe_offset, chunk_size,
keybundle_size_per_input);
check_cuda_error(cudaGetLastError());
// Accumulate
for (int j = 0; j < chunk_size; j++) {
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
stream->stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
global_accumulator_fft, lwe_dimension,
glwe_dimension, polynomial_size, base_log,
level_count, j + lwe_offset);
check_cuda_error(cudaGetLastError());
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>
<<<grid_accumulate_step_two, thds, full_sm_accumulate_step_two,
stream->stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft,
global_accumulator, global_accumulator_fft,
lwe_dimension, glwe_dimension, polynomial_size,
level_count, grouping_factor, j, lwe_offset,
lwe_chunk_size);
check_cuda_error(cudaGetLastError());
}
}
}
#endif // CUDA_MULTIBIT_PBS_CUH


@@ -0,0 +1,500 @@
#ifndef CUDA_BSK_CUH
#define CUDA_BSK_CUH
#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "polynomial/parameters.cuh"
#include <atomic>
#include <cstdint>
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
level_count) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
level_count) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
glwe_dimension * polynomial_size / 2];
}
////////////////////////////////////////////////
__device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count) {
return i * (1 << grouping_factor) * polynomial_size / 2 *
(glwe_dimension + 1) * (glwe_dimension + 1) * level_count;
}
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
glwe_dimension, level_count);
return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
glwe_dimension, level_count);
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t level_count,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(stream->gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
// Here the buffer size is the size of double2 times the number of polynomials
// times the polynomial size over 2 because the polynomials are compressed
// into the complex domain to perform the FFT
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double2);
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *h_bsk = (double2 *)malloc(buffer_size);
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream);
  // Compress the real bsk into complex form and normalize each coefficient
  // by the maximum Torus value
for (int i = 0; i < total_polynomials; i++) {
int complex_current_poly_idx = i * polynomial_size / 2;
int torus_current_poly_idx = i * polynomial_size;
for (int j = 0; j < polynomial_size / 2; j++) {
h_bsk[complex_current_poly_idx + j].x = src[torus_current_poly_idx + j];
h_bsk[complex_current_poly_idx + j].y =
src[torus_current_poly_idx + j + polynomial_size / 2];
h_bsk[complex_current_poly_idx + j].x /=
(double)std::numeric_limits<T>::max();
h_bsk[complex_current_poly_idx + j].y /=
(double)std::numeric_limits<T>::max();
}
}
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
default:
break;
}
cuda_drop_async(d_bsk, stream);
cuda_drop_async(buffer, stream);
free(h_bsk);
}
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);
cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
stream);
}
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials) {
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
default:
break;
}
cuda_drop_async(buffer, stream);
}
// These explicit instantiations tell the compiler which versions of the
// template functions to generate
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif // CUDA_BSK_CUH


@@ -0,0 +1,305 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS_CUH
#define GPU_POLYNOMIAL_FUNCTIONS_CUH
#include "crypto/torus.cuh"
#include "device.h"
// Return A if C == 0 and B if C == 1
#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
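// Note: C is expected to be exactly 0 or 1; any other value breaks the
// branchless selection.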
/*
* function compresses decomposed buffer into half size complex buffer for fft
*/
template <class params>
__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid].x = __int2double_rn(src[2 * tid]);
dst[tid].y = __int2double_rn(src[2 * tid + 1]);
tid += params::degree / params::opt;
}
}
/*
* copy source polynomial to specific slice of batched polynomials
* used only in low latency version
*/
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
int tid = threadIdx.x;
int begin = i * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid + begin] = source[tid];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2 + begin] = source[params::degree / 2];
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial(T *source, T *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[tid] = source[tid];
tid = tid + block_size;
}
}
/*
* accumulates source polynomial into specific slice of batched polynomial
* used only in low latency version
*/
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
int tid = threadIdx.x;
int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid] += source[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2] += source[params::degree / 2 + begin];
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
 * Performs acc = input / X^j (negacyclic division by the monomial X^j) if zeroAcc = false
 * Performs acc = 0 if zeroAcc
 * Takes a single buffer and calculates in place.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j, bool zeroAcc,
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator_slice[tid] = 0;
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
// if (tid < degree - j)
// accumulator_slice[tid] = input_slice[tid + j];
// else
// accumulator_slice[tid] = -input_slice[tid - degree + j];
int x = tid + j - SEL(degree, 0, tid < degree - j);
accumulator_slice[tid] =
SEL(-1, 1, tid < degree - j) * input_slice[x];
} else {
int32_t jj = j - degree;
// if (tid < degree - jj)
// accumulator_slice[tid] = -input_slice[tid + jj];
// else
// accumulator_slice[tid] = input_slice[tid - degree + jj];
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
accumulator_slice[tid] =
SEL(1, -1, tid < degree - jj) * input_slice[x];
}
tid += block_size;
}
}
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
 * Performs result_acc = acc * X^j - acc (negacyclic multiplication by the
 * monomial X^j, then subtraction of the original acc)
 * Takes a single buffer as input and returns a single rotated buffer.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
// if (tid < j)
// result_acc_slice[tid] = -acc_slice[tid - j + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
int x = tid - j + SEL(0, degree, tid < j);
result_acc_slice[tid] =
SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
} else {
int32_t jj = j - degree;
// if (tid < jj)
// result_acc_slice[tid] = acc_slice[tid - jj + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
int x = tid - jj + SEL(0, degree, tid < jj);
result_acc_slice[tid] =
SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
}
tid += block_size;
}
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly = 1) {
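  // Rounds each coefficient to the closest multiple of
  // 2^(8 * sizeof(T) - level_count * base_log), i.e. keeps only the most
  // significant level_count * base_log bits, rounded to nearest.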
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc_slice[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc_slice[tid] = res_acc;
tid = tid + block_size;
}
}
}
template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result,
bool init_torus = false) {
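  // Converts the FFT output back to the torus: take the fractional part of
  // each double, scale it by the torus modulus and round to the nearest
  // integer, then either overwrite or accumulate into `result`.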
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
double v1 = m_values[tid].x;
double v2 = m_values[tid].y;
double frac = v1 - floor(v1);
frac *= mx;
double carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V1 = 0;
typecast_double_to_torus<Torus>(frac, V1);
frac = v2 - floor(v2);
frac *= mx;
    carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V2 = 0;
typecast_double_to_torus<Torus>(frac, V2);
if (init_torus) {
result[tid] = V1;
result[tid + params::degree / 2] = V2;
} else {
result[tid] += V1;
result[tid + params::degree / 2] += V2;
}
tid = tid + params::degree / params::opt;
}
}
// Extracts the body of a GLWE.
// k is the offset to find the body element / polynomial in the lwe_array_out /
// accumulator
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t k) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[k * params::degree] = accumulator[k * params::degree];
}
// Extracts the mask from num_poly polynomials individually
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t num_poly = 1) {
for (int z = 0; z < num_poly; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =
(Torus *)accumulator + (ptrdiff_t)(z * params::degree);
// Set ACC = -ACC
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = -accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Reverse the accumulator
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator_slice[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// (equivalent to multiply_by_monomial_negacyclic_inplace(1))
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
// if (tid < 1)
// result[i] = -accumulator_slice[tid - 1 + params::degree];
// else
// result[i] = accumulator_slice[tid - 1];
int x = tid - 1 + SEL(0, params::degree, tid < 1);
result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out_slice[tid] = accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
}
}
#endif // GPU_POLYNOMIAL_FUNCTIONS_CUH


@@ -0,0 +1,91 @@
#ifndef CUDA_PARAMETERS_CUH
#define CUDA_PARAMETERS_CUH
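// Integer log2, valid for power-of-two n >= 2 (which is how it is used here).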
constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }
constexpr int choose_opt_amortized(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 8;
else if (degree == 4096)
return 16;
else if (degree == 8192)
return 32;
else
return 64;
}
constexpr int choose_opt(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 4;
else if (degree == 4096)
return 4;
else if (degree == 8192)
return 8;
else if (degree == 16384)
return 16;
else
return 64;
}
template <class params> class HalfDegree {
public:
constexpr static int degree = params::degree / 2;
constexpr static int opt = params::opt / 2;
constexpr static int log2_degree = params::log2_degree - 1;
};
template <int N> class Degree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt(N);
constexpr static int log2_degree = log2(N);
};
template <int N> class AmortizedDegree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt_amortized(N);
constexpr static int log2_degree = log2(N);
};
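// Illustrative compile-time checks (not required by the kernels): they only
// restate values implied by log2/choose_opt/choose_opt_amortized above. For
// example, with Degree<1024> each thread handles opt = 4 coefficients, so the
// kernels use degree / opt = 256 threads per polynomial.
static_assert(Degree<1024>::opt == 4 && Degree<1024>::log2_degree == 10, "");
static_assert(Degree<8192>::opt == 8 && Degree<8192>::log2_degree == 13, "");
static_assert(AmortizedDegree<2048>::opt == 8, "");
static_assert(HalfDegree<Degree<2048>>::degree == 1024 &&
              HalfDegree<Degree<2048>>::opt == 2, "");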
enum sharedMemDegree {
NOSM = 0,
PARTIALSM = 1,
FULLSM = 2
};
class ForwardFFT {
public:
constexpr static int direction = 0;
};
class BackwardFFT {
public:
constexpr static int direction = 1;
};
class ReorderFFT {
public:
constexpr static int reorder = 1;
};
class NoReorderFFT {
public:
constexpr static int reorder = 0;
};
template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
#endif // CNCRT_PARAMETERS_H

View File

@@ -0,0 +1,86 @@
#ifndef CUDA_POLYNOMIAL_MATH_CUH
#define CUDA_POLYNOMIAL_MATH_CUH
#include "crypto/torus.cuh"
#include "parameters.cuh"
template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &data[pos];
return ptr;
}
template <typename FT, class params>
__device__ void sub_polynomial(FT *result, FT *first, FT *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
result[tid] = first[tid] - second[tid];
tid += params::degree / params::opt;
}
}
template <class params, typename T>
__device__ void polynomial_product_in_fourier_domain(T *result, T *first,
T *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] =
first[params::degree / 2] * second[params::degree / 2];
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params, typename T>
__device__ void
polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
bool init_accumulator = false) {
int tid = threadIdx.x;
if (init_accumulator) {
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
} else {
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += first[tid] * second[tid];
tid += params::degree / params::opt;
}
}
}
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, T *poly,
uint64_t monomial_degree,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
int pos = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
T element = poly[pos];
int new_pos = (pos + monomial_degree) % params::degree;
T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
x = SEL(-x, x, new_pos >= remainder_degrees);
if (init_accumulator)
result[new_pos] = x;
else
result[new_pos] += x;
pos += params::degree / params::opt;
}
}
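// Worked example (assuming SEL(a, b, cond) selects b when cond holds, as in the
// sample extraction code): for params::degree N = 4 and monomial_degree = 5,
// full_cycles_count = 1 and remainder_degrees = 1, so X^5 = -X mod (X^4 + 1) and
//   -X * (c0 + c1*X + c2*X^2 + c3*X^3) = c3 - c0*X - c1*X^2 - c2*X^3,
// which is exactly what the loop accumulates:
//   result[1] -= c0, result[2] -= c1, result[3] -= c2, result[0] += c3.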
#endif // CNCRT_POLYNOMIAL_MATH_H

View File

@@ -0,0 +1,97 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH
#include <cassert>
#include <cstdint>
#include <cstdio>
#define SNT 1
#define dPI 6.283185307179586231995926937088
using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;
//--------------------------------------------------
// Basic double2 operations
__device__ inline double2 conjugate(const double2 num) {
double2 res;
res.x = num.x;
res.y = -num.y;
return res;
}
__device__ inline void operator+=(double2 &lh, const double2 rh) {
lh.x += rh.x;
lh.y += rh.y;
}
__device__ inline void operator-=(double2 &lh, const double2 rh) {
lh.x -= rh.x;
lh.y -= rh.y;
}
__device__ inline double2 operator+(const double2 a, const double2 b) {
double2 res;
res.x = a.x + b.x;
res.y = a.y + b.y;
return res;
}
__device__ inline double2 operator-(const double2 a, const double2 b) {
double2 res;
res.x = a.x - b.x;
res.y = a.y - b.y;
return res;
}
__device__ inline double2 operator*(const double2 a, const double2 b) {
double xx = a.x * b.x;
double xy = a.x * b.y;
double yx = a.y * b.x;
double yy = a.y * b.y;
double2 res;
// asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
// "d"(b.x), "d"(yy));
res.x = xx - yy;
res.y = xy + yx;
return res;
}
__device__ inline double2 operator*(const double2 a, double b) {
double2 res;
res.x = a.x * b;
res.y = a.y * b;
return res;
}
__device__ inline void operator*=(double2 &a, const double2 b) {
double tmp = a.x;
a.x *= b.x;
a.x -= a.y * b.y;
a.y *= b.x;
a.y += b.y * tmp;
}
__device__ inline void operator*=(double2 &a, const double b) {
a.x *= b;
a.y *= b;
}
__device__ inline void operator/=(double2 &a, const double b) {
a.x /= b;
a.y /= b;
}
__device__ inline double2 operator*(double a, double2 b) {
double2 res;
res.x = b.x * a;
res.y = b.y * a;
return res;
}
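// Note: operator*(double2, double2) above is plain complex multiplication,
//   (a.x + i*a.y) * (b.x + i*b.y) = (a.x*b.x - a.y*b.y) + i*(a.x*b.y + a.y*b.x),
// which is what the FFT-domain polynomial products build on.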
#endif

View File

@@ -0,0 +1,76 @@
#ifndef CNCRT_INT128_CUH
#define CNCRT_INT128_CUH
// abseil's int128 type
// licensed under Apache license
class uint128 {
public:
__device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
uint64_t hi_;
};
class int128 {
public:
int128() = default;
__device__ operator unsigned long long() const {
return static_cast<unsigned long long>(lo_);
}
__device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
int64_t hi_;
};
__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
return uint128(high, low);
}
template <typename T> __device__ uint128 make_uint128_from_float(T v) {
if (v >= ldexp(static_cast<T>(1), 64)) {
uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
return make_uint128(hi, lo);
}
return make_uint128(0, static_cast<uint64_t>(v));
}
__device__ inline int128 make_int128(int64_t high, uint64_t low) {
return int128(high, low);
}
__device__ inline int64_t bitcast_to_signed(uint64_t v) {
return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
: static_cast<int64_t>(v);
}
__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }
__device__ __forceinline__ uint128 operator-(uint128 val) {
uint64_t hi = ~uint128_high64(val);
uint64_t lo = ~uint128_low64(val) + 1;
if (lo == 0)
++hi; // carry
return make_uint128(hi, lo);
}
template <typename T> __device__ int128 make_int128_from_float(T v) {
// We must convert the absolute value and then negate as needed, because
// floating point types are typically sign-magnitude. Otherwise, the
// difference between the high and low 64 bits when interpreted as two's
// complement overwhelms the precision of the mantissa.
uint128 result =
v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);
return make_int128(bitcast_to_signed(uint128_high64(result)),
uint128_low64(result));
}
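// Example (for intuition only): make_int128_from_float(-1.0) first builds
// uint128(0, 1) from 1.0, negates it to uint128(~0ull, ~0ull) (two's complement),
// then bitcasts the high word, giving int128{hi_ = -1, lo_ = UINT64_MAX}, i.e. -1.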
#endif

View File

@@ -0,0 +1,36 @@
#ifndef HELPER_CUH
#define HELPER_CUH
#include <stdio.h>
template <typename T> __global__ void print_debug_kernel(T *src, int N) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i]);
}
}
template <typename T> void print_debug(const char *name, T *src, int N) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_debug_kernel<<<1, 1>>>(src, N);
cudaDeviceSynchronize();
printf("\n");
}
template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
}
}
template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
cudaDeviceSynchronize();
printf("\n");
}
#endif

View File

@@ -0,0 +1,21 @@
#ifndef KERNEL_DIMENSIONS_CUH
#define KERNEL_DIMENSIONS_CUH
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
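// Example: getNumBlocksAndThreads(4096, 512, blocks, threads) yields
// threads = 512 (since 4096 >= 2 * 512) and blocks = (4096 + 511) / 512 = 8;
// for a small n such as 100, threads = max(128, nextPow2(50)) = 128 and blocks = 1.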
#endif // KERNEL_DIMENSIONS_H

View File

@@ -0,0 +1,18 @@
[package]
name = "tfhe-cuda-backend"
version = "0.1.2"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
description = "Cuda implementation of TFHE-rs primitives."
homepage = "https://www.zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
[build-dependencies]
cmake = { version = "0.1" }
[dependencies]
thiserror = "1.0"

View File

@@ -0,0 +1,28 @@
BSD 3-Clause Clear License
Copyright © 2023 ZAMA.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
or promote products derived from this software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,28 @@
use std::env;
use std::process::Command;
fn main() {
println!("Build tfhe-cuda-backend");
if env::consts::OS == "linux" {
let output = Command::new("./get_os_name.sh").output().unwrap();
let distribution = String::from_utf8(output.stdout).unwrap();
if distribution != "Ubuntu\n" {
println!(
"cargo:warning=This Linux distribution is not officially supported. \
Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
);
}
let dest = cmake::build("../implementation");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
println!("cargo:rustc-link-lib=stdc++");
} else {
panic!(
"Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
);
}
}

View File

@@ -0,0 +1,3 @@
#!/bin/bash
cat /etc/os-release | grep "\<NAME\>" | sed "s/NAME=\"//g" | sed "s/\"//g"

View File

@@ -0,0 +1,794 @@
use std::ffi::c_void;
#[link(name = "tfhe_cuda_backend", kind = "static")]
extern "C" {
/// Create a new Cuda stream on GPU `gpu_index`
pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;
/// Destroy the Cuda stream `v_stream` on GPU `gpu_index`
pub fn cuda_destroy_stream(v_stream: *mut c_void) -> i32;
/// Allocate `size` memory on GPU `gpu_index` asynchronously
pub fn cuda_malloc_async(size: u64, v_stream: *const c_void) -> *mut c_void;
/// Copy `size` memory asynchronously from `src` on GPU `gpu_index` to `dest` on CPU using
/// the Cuda stream `v_stream`.
pub fn cuda_memcpy_async_to_cpu(
dest: *mut c_void,
src: *const c_void,
size: u64,
v_stream: *const c_void,
) -> i32;
/// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using
/// the Cuda stream `v_stream`.
pub fn cuda_memcpy_async_to_gpu(
dest: *mut c_void,
src: *const c_void,
size: u64,
v_stream: *const c_void,
) -> i32;
/// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using
/// the Cuda stream `v_stream`.
pub fn cuda_memcpy_async_gpu_to_gpu(
dest: *mut c_void,
src: *const c_void,
size: u64,
v_stream: *const c_void,
) -> i32;
    /// Set `size` bytes of memory at `dest` on the GPU to `value` asynchronously, using
    /// the Cuda stream `v_stream`.
pub fn cuda_memset_async(
dest: *mut c_void,
value: u64,
size: u64,
v_stream: *const c_void,
) -> i32;
/// Get the total number of Nvidia GPUs detected on the platform
pub fn cuda_get_number_of_gpus() -> i32;
/// Synchronize all streams on GPU `gpu_index`
pub fn cuda_synchronize_device(gpu_index: u32) -> i32;
/// Synchronize Cuda stream
pub fn cuda_synchronize_stream(v_stream: *const c_void) -> i32;
/// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream`
pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32;
/// Free memory for pointer `ptr` on GPU `gpu_index` synchronously
pub fn cuda_drop(ptr: *mut c_void) -> i32;
/// Get the maximum amount of shared memory on GPU `gpu_index`
pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32;
/// Copy a bootstrap key `src` represented with 64 bits in the standard domain from the CPU to
/// the GPU `gpu_index` using the stream `v_stream`, and convert it to the Fourier domain on the
/// GPU. The resulting bootstrap key `dest` on the GPU is an array of f64 values.
pub fn cuda_convert_lwe_bootstrap_key_64(
dest: *mut c_void,
src: *const c_void,
v_stream: *const c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
);
/// Copy a multi-bit bootstrap key `src` represented with 64 bits in the standard domain from
/// the CPU to the GPU `gpu_index` using the stream `v_stream`. The resulting bootstrap key
/// `dest` on the GPU is an array of uint64_t values.
pub fn cuda_convert_lwe_multi_bit_bootstrap_key_64(
dest: *mut c_void,
src: *const c_void,
v_stream: *const c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
grouping_factor: u32,
);
    /// Copy `number_of_cts` LWE ciphertexts represented with 64 bits in the standard domain from the
/// CPU to the GPU `gpu_index` using the stream `v_stream`. All ciphertexts must be
/// concatenated.
pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64(
dest: *mut c_void,
src: *mut c_void,
v_stream: *const c_void,
number_of_cts: u32,
lwe_dimension: u32,
);
    /// Copy `number_of_cts` LWE ciphertexts represented with 64 bits in the standard domain from the
    /// GPU `gpu_index` to the CPU using the stream `v_stream`. All ciphertexts must be
/// concatenated.
pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64(
dest: *mut c_void,
src: *mut c_void,
v_stream: *const c_void,
number_of_cts: u32,
lwe_dimension: u32,
);
/// This scratch function allocates the necessary amount of data on the GPU for
/// the low latency PBS on 64-bit inputs, into `pbs_buffer`. It also configures SM
    /// options on the GPU in case the FULLSM or PARTIALSM modes are going to be used.
pub fn scratch_cuda_bootstrap_low_latency_64(
v_stream: *const c_void,
pbs_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
max_shared_memory: u32,
allocate_gpu_memory: bool,
);
/// Perform bootstrapping on a batch of input u64 LWE ciphertexts.
///
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c =
/// (a0,..an-1,b) where n is the LWE dimension
/// - `lut_vector`: should hold as many test vectors of size polynomial_size
/// as there are input ciphertexts, but actually holds
/// `num_lut_vectors` vectors to reduce memory usage
/// - `lut_vector_indexes`: stores the index corresponding to
/// which test vector to use for each sample in
/// `lut_vector`
/// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n
/// mask values + 1 body value
/// - `bootstrapping_key`: GGSW encryption of the LWE secret key sk1
/// under secret key sk2.
/// bsk = Z + sk1 H
/// where H is the gadget matrix and Z is a matrix (k+1).l
/// containing GLWE encryptions of 0 under sk2.
/// bsk is thus a tensor of size (k+1)^2.l.N.n
/// where l is the number of decomposition levels and
/// k is the GLWE dimension, N is the polynomial size for
/// GLWE. The polynomial size for GLWE and the test vector
/// are the same because they have to be in the same ring
/// to be multiplied.
/// - `pbs_buffer`: a preallocated buffer to store temporary results
/// - `lwe_dimension`: size of the Torus vector used to encrypt the input
/// LWE ciphertexts - referred to as n above (~ 600)
/// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT
/// GLWE ciphertexts - referred to as k above. Only the value 1 is supported for this parameter.
/// - `polynomial_size`: size of the test polynomial (test vector) and size of the
/// GLWE polynomial (~1024)
/// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
/// - `level_count`: number of decomposition levels in the gadget matrix (~4)
/// - `num_samples`: number of encrypted input messages
/// - `num_lut_vectors`: parameter to set the actual number of test vectors to be
/// used
/// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In
/// case of multi-GPU computing, it is assumed that only a part of the input LWE array is
/// copied to each GPU, but the whole LUT array is copied (because the case when the number
/// of LUTs is smaller than the number of input LWEs is not trivial to take into account in
/// the data repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider
/// for a given LWE input in the LUT array `lut_vector`.
/// - `max_shared_memory` maximum amount of shared memory to be used inside
/// device functions
///
/// This function calls a wrapper to a device kernel that performs the
/// bootstrapping:
/// - the kernel is templatized based on integer discretization and
/// polynomial degree
/// - num_samples * level_count * (glwe_dimension + 1) blocks of threads are launched, where
/// each thread is going to handle one or more polynomial coefficients at each stage,
/// for a given level of decomposition, either for the LUT mask or its body:
/// - perform the blind rotation
/// - round the result
/// - get the decomposition for the current level
/// - switch to the FFT domain
/// - multiply with the bootstrapping key
/// - come back to the coefficients representation
/// - between each stage a synchronization of the threads is necessary (some
/// synchronizations
/// happen at the block level, some happen between blocks, using cooperative groups).
/// - in case the device has enough shared memory, temporary arrays used for
/// the different stages (accumulators) are stored into the shared memory
/// - the accumulators serve to combine the results for all decomposition
/// levels
/// - the constant memory (64K) is used for storing the roots of identity
/// values for the FFT
pub fn cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_output_indexes: *const c_void,
lut_vector: *const c_void,
lut_vector_indexes: *const c_void,
lwe_array_in: *const c_void,
lwe_input_indexes: *const c_void,
bootstrapping_key: *const c_void,
pbs_buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level: u32,
num_samples: u32,
num_lut_vectors: u32,
lwe_idx: u32,
max_shared_memory: u32,
);
/// This cleanup function frees the data for the low latency PBS on GPU
/// contained in pbs_buffer for 32 or 64-bit inputs.
pub fn cleanup_cuda_bootstrap_low_latency(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
/// This scratch function allocates the necessary amount of data on the GPU for
/// the multi-bit PBS on 64-bit inputs into `pbs_buffer`.
pub fn scratch_cuda_multi_bit_pbs_64(
v_stream: *const c_void,
pbs_buffer: *mut *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
grouping_factor: u32,
input_lwe_ciphertext_count: u32,
max_shared_memory: u32,
allocate_gpu_memory: bool,
lwe_chunk_size: u32,
);
/// Perform bootstrapping on a batch of input u64 LWE ciphertexts using the multi-bit algorithm.
///
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c =
/// (a0,..an-1,b) where n is the LWE dimension
/// - `lut_vector`: should hold as many test vectors of size polynomial_size
/// as there are input ciphertexts, but actually holds
/// `num_lut_vectors` vectors to reduce memory usage
/// - `lut_vector_indexes`: stores the index corresponding to
/// which test vector to use for each sample in
/// `lut_vector`
/// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n
/// mask values + 1 body value
/// - `bootstrapping_key`: GGSW encryption of elements of the LWE secret key as in the
/// classical PBS, but this time we follow Zhou's trick and encrypt combinations of elements
/// of the key
/// - `pbs_buffer`: a preallocated buffer to store temporary results
/// - `lwe_dimension`: size of the Torus vector used to encrypt the input
/// LWE ciphertexts - referred to as n above (~ 600)
/// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT
/// GLWE ciphertexts - referred to as k above. Only the value 1 is supported for this parameter.
/// - `polynomial_size`: size of the test polynomial (test vector) and size of the
/// GLWE polynomial (~1024)
/// - `grouping_factor`: number of elements of the LWE secret key combined per GGSW of the
/// bootstrap key
/// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
/// - `level_count`: number of decomposition levels in the gadget matrix (~4)
/// - `num_samples`: number of encrypted input messages
/// - `num_lut_vectors`: parameter to set the actual number of test vectors to be
/// used
/// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In
/// case of multi-GPU computing, it is assumed that only a part of the input LWE array is
/// copied to each GPU, but the whole LUT array is copied (because the case when the number
/// of LUTs is smaller than the number of input LWEs is not trivial to take into account in
/// the data repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider
/// for a given LWE input in the LUT array `lut_vector`.
/// - `max_shared_memory` maximum amount of shared memory to be used inside
/// device functions
pub fn cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_output_indexes: *const c_void,
lut_vector: *const c_void,
lut_vector_indexes: *const c_void,
lwe_array_in: *const c_void,
lwe_input_indexes: *const c_void,
bootstrapping_key: *const c_void,
pbs_buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level: u32,
num_samples: u32,
num_lut_vectors: u32,
lwe_idx: u32,
max_shared_memory: u32,
lwe_chunk_size: u32,
);
/// This cleanup function frees the data for the multi-bit PBS on GPU
/// contained in pbs_buffer for 64-bit inputs.
pub fn cleanup_cuda_multi_bit_pbs(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
/// Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
///
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out`: output batch of num_samples keyswitched ciphertexts c =
/// (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
/// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing lwe_dimension_in
/// mask values + 1 body value
/// - `ksk`: the keyswitch key to be used in the operation
/// - `base_log`: the log of the base used in the decomposition (should be the one used to
/// create the ksk).
/// - `level_count`: the number of levels used in the decomposition (should be the one used to
/// create the ksk).
/// - `num_samples`: the number of input and output LWE ciphertexts.
///
/// This function calls a wrapper to a device kernel that performs the keyswitch.
/// `num_samples` blocks of threads are launched
pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_output_indexes: *const c_void,
lwe_array_in: *const c_void,
lwe_input_indexes: *const c_void,
keyswitch_key: *const c_void,
input_lwe_dimension: u32,
output_lwe_dimension: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
);
/// Perform the negation of a u64 input LWE ciphertext vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated on
/// the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `input_lwe_dimension` is the number of mask elements in the two input and in the output
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE
/// ciphertext vector, as well as in the output.
///
/// Each element (mask element or body) of the input LWE ciphertext vector is negated.
/// The result is stored in the output LWE ciphertext vector. The input LWE ciphertext vector
/// is left unchanged. This function is a wrapper to a device function that performs the
/// operation on the GPU.
pub fn cuda_negate_lwe_ciphertext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_array_in: *const c_void,
input_lwe_dimension: u32,
input_lwe_ciphertext_count: u32,
);
pub fn cuda_negate_integer_radix_ciphertext_64_inplace(
v_stream: *const c_void,
lwe_array: *mut c_void,
lwe_dimension: u32,
lwe_ciphertext_count: u32,
message_modulus: u32,
carry_modulus: u32,
);
/// Perform the addition of two u64 input LWE ciphertext vectors.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated on
/// the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `input_lwe_dimension` is the number of mask elements in the two input and in the output
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE
/// ciphertext vector, as well as in the output.
///
/// Each element (mask element or body) of the input LWE ciphertext vector 1 is added to the
    /// corresponding element in the input LWE ciphertext vector 2. The result is stored in the output LWE
/// ciphertext vector. The two input LWE ciphertext vectors are left unchanged. This function is
/// a wrapper to a device function that performs the operation on the GPU.
pub fn cuda_add_lwe_ciphertext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_array_in_1: *const c_void,
lwe_array_in_2: *const c_void,
input_lwe_dimension: u32,
input_lwe_ciphertext_count: u32,
);
/// Perform the addition of a u64 input LWE ciphertext vector with a u64 input plaintext vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated
/// on the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `plaintext_array_in` is the plaintext vector used as input, it should have been
/// allocated and initialized before calling this function. It should be of size
/// `input_lwe_ciphertext_count`.
/// - `input_lwe_dimension` is the number of mask elements in the input and output LWE
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE
/// ciphertext vector, as well as in the output. It is also the number of plaintexts in the
/// input plaintext vector.
///
/// Each plaintext of the input plaintext vector is added to the body of the corresponding LWE
/// ciphertext in the LWE ciphertext vector. The result of the operation is stored in the output
/// LWE ciphertext vector. The two input vectors are unchanged. This function is a
/// wrapper to a device function that performs the operation on the GPU.
pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_array_in: *const c_void,
plaintext_array_in: *const c_void,
input_lwe_dimension: u32,
input_lwe_ciphertext_count: u32,
);
/// Perform the multiplication of a u64 input LWE ciphertext vector with a u64 input cleartext
/// vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
/// - `lwe_array_out` is an array of size
/// `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have been allocated
/// on the GPU before calling this function, and that will hold the result of the computation.
/// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been
/// allocated and initialized before calling this function. It has the same size as the output
/// array.
/// - `cleartext_array_in` is the cleartext vector used as input, it should have been
/// allocated and initialized before calling this function. It should be of size
/// `input_lwe_ciphertext_count`.
/// - `input_lwe_dimension` is the number of mask elements in the input and output LWE
/// ciphertext vectors
/// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE
/// ciphertext vector, as well as in the output. It is also the number of cleartexts in the
/// input cleartext vector.
///
    /// Each cleartext of the input cleartext vector is multiplied with the mask and body of the
/// corresponding LWE ciphertext in the LWE ciphertext vector.
/// The result of the operation is stored in the output
/// LWE ciphertext vector. The two input vectors are unchanged. This function is a
/// wrapper to a device function that performs the operation on the GPU.
pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_array_in: *const c_void,
cleartext_array_in: *const c_void,
input_lwe_dimension: u32,
input_lwe_ciphertext_count: u32,
);
pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
message_modulus: u32,
carry_modulus: u32,
glwe_dimension: u32,
lwe_dimension: u32,
polynomial_size: u32,
pbs_base_log: u32,
pbs_level: u32,
ks_base_log: u32,
ks_level: u32,
grouping_factor: u32,
num_blocks: u32,
pbs_type: u32,
max_shared_memory: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_integer_mult_radix_ciphertext_kb_64(
v_stream: *const c_void,
radix_lwe_out: *mut c_void,
radix_lwe_left: *const c_void,
radix_lwe_right: *const c_void,
bsk: *const c_void,
ksk: *const c_void,
mem_ptr: *mut i8,
message_modulus: u32,
carry_modulus: u32,
glwe_dimension: u32,
lwe_dimension: u32,
polynomial_size: u32,
pbs_base_log: u32,
pbs_level: u32,
ks_base_log: u32,
ks_level: u32,
grouping_factor: u32,
num_blocks: u32,
pbs_type: u32,
max_shared_memory: u32,
);
pub fn cleanup_cuda_integer_mult(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
v_stream: *const c_void,
lwe_array: *mut c_void,
scalar_input: *const c_void,
lwe_dimension: u32,
lwe_ciphertext_count: u32,
message_modulus: u32,
carry_modulus: u32,
);
pub fn cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
v_stream: *const c_void,
lwe_array: *mut c_void,
scalar_input: u64,
lwe_dimension: u32,
lwe_ciphertext_count: u32,
);
pub fn scratch_cuda_integer_radix_bitop_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
op_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
v_stream: *const c_void,
radix_lwe_out: *mut c_void,
radix_lwe_left: *const c_void,
radix_lwe_right: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cuda_bitnot_integer_radix_ciphertext_kb_64(
v_stream: *const c_void,
radix_lwe_out: *mut c_void,
radix_lwe_in: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
v_stream: *const c_void,
radix_lwe_output: *mut c_void,
radix_lwe_input: *mut c_void,
clear_blocks: *const c_void,
num_clear_blocks: u32,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
op_type: u32,
);
pub fn cleanup_cuda_integer_bitop(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_integer_radix_comparison_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
op_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
v_stream: *const c_void,
radix_lwe_out: *mut c_void,
radix_lwe_left: *const c_void,
radix_lwe_right: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
v_stream: *const c_void,
radix_lwe_out: *mut c_void,
radix_lwe_in: *const c_void,
scalar_blocks: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
num_scalar_blocks: u32,
);
pub fn cleanup_cuda_integer_comparison(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_full_propagation_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
pbs_level: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_full_propagation_64_inplace(
v_stream: *const c_void,
radix_lwe_right: *mut c_void,
mem_ptr: *mut i8,
ksk: *const c_void,
bsk: *const c_void,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
ks_base_log: u32,
ks_level: u32,
pbs_base_log: u32,
pbs_level: u32,
grouping_factor: u32,
num_blocks: u32,
);
pub fn cleanup_cuda_full_propagation(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_integer_radix_scalar_shift_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
shift_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_integer_radix_scalar_shift_kb_64_inplace(
v_stream: *const c_void,
radix_lwe: *mut c_void,
shift: u32,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_integer_radix_scalar_shift(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_integer_radix_cmux_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
v_stream: *const c_void,
lwe_array_out: *mut c_void,
lwe_condition: *const c_void,
lwe_array_true: *const c_void,
lwe_array_false: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_integer_radix_cmux(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
shift_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
v_stream: *const c_void,
radix_lwe: *mut c_void,
n: u32,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_integer_radix_scalar_rotate(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_propagate_single_carry_low_latency_kb_64_inplace(
v_stream: *const c_void,
radix_lwe: *mut c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_propagate_single_carry_low_latency(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
);
}
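// Typical call sequence for the low latency PBS bindings above (illustrative
// sketch only; sizes and parameter values are placeholders, and in practice these
// raw functions are driven through the wrappers in tfhe::core_crypto::gpu):
//
//   let stream = cuda_create_stream(gpu_index);
//   let max_shared_memory = cuda_get_max_shared_memory(gpu_index) as u32;
//   let d_in = cuda_malloc_async(input_size_in_bytes, stream);
//   cuda_memcpy_async_to_gpu(d_in, h_in, input_size_in_bytes, stream);
//   let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
//   scratch_cuda_bootstrap_low_latency_64(stream, &mut pbs_buffer, glwe_dimension,
//       polynomial_size, level_count, num_samples, max_shared_memory, true);
//   cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(stream, d_out, d_out_indexes,
//       d_lut, d_lut_indexes, d_in, d_in_indexes, d_bsk, pbs_buffer, lwe_dimension,
//       glwe_dimension, polynomial_size, base_log, level_count, num_samples,
//       num_lut_vectors, 0, max_shared_memory);
//   cleanup_cuda_bootstrap_low_latency(stream, &mut pbs_buffer);
//   cuda_memcpy_async_to_cpu(h_out, d_out, output_size_in_bytes, stream);
//   cuda_synchronize_stream(stream);
//   cuda_drop_async(d_in, stream);
//   cuda_destroy_stream(stream);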

View File

@@ -0,0 +1 @@
pub mod cuda_bind;

View File

@@ -1,4 +1,7 @@
{
"m6i.metal": 7.168,
"hpc7a.96xlarge": 7.7252
"hpc7a.96xlarge": 7.7252,
"p3.2xlarge": 3.06,
"p4d.24xlarge": 32.7726,
"p5.48xlarge": 98.32
}

View File

@@ -18,6 +18,31 @@ region = "eu-west-1"
image_id = "ami-0e88d98b86aff13de"
instance_type = "hpc7a.96xlarge"
[profile.gpu-test]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
instance_type = "p3.2xlarge"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
spawn_retry_duration = 60
[profile.gpu-bench]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
instance_type = "p4d.24xlarge"
# One spawn attempt every 30 seconds for 6 hours
spawn_retry_attempts = 720
spawn_retry_duration = 360
max_spot_hourly_price = "100.0"
[profile.gpu-bench-big]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
instance_type = "p5.48xlarge"
spawn_retry_attempts = 720
spawn_retry_duration = 360
max_spot_hourly_price = "150.0"
[command.cpu_test]
workflow = "aws_tfhe_tests.yml"
profile = "cpu-big"
@@ -43,21 +68,36 @@ workflow = "aws_tfhe_fast_tests.yml"
profile = "cpu-big"
check_run_name = "CPU AWS Fast Tests"
[command.integer_full_bench]
workflow = "integer_full_benchmark.yml"
profile = "bench"
check_run_name = "Integer CPU AWS Benchmarks Full Suite"
[command.gpu_test]
workflow = "aws_tfhe_gpu_tests.yml"
profile = "gpu-test"
check_run_name = "GPU AWS Tests"
[command.signed_integer_full_bench]
workflow = "signed_integer_full_benchmark.yml"
profile = "bench"
check_run_name = "Signed Integer CPU AWS Benchmarks Full Suite"
[command.integer_full_bench]
workflow = "integer_full_benchmark.yml"
profile = "bench"
check_run_name = "Integer CPU AWS Benchmarks Full Suite"
[command.integer_gpu_full_bench]
workflow = "integer_gpu_full_benchmark.yml"
profile = "gpu-test" # p3.2xlarge is the baseline for GPU benchmarks
check_run_name = "Integer GPU AWS Benchmarks Full Suite"
[command.integer_bench]
workflow = "integer_benchmark.yml"
profile = "bench"
check_run_name = "Integer CPU AWS Benchmarks"
[command.integer_gpu_bench]
workflow = "integer_gpu_benchmark.yml"
profile = "gpu-test"
check_run_name = "Integer GPU AWS Benchmarks"
[command.integer_multi_bit_bench]
workflow = "integer_multi_bit_benchmark.yml"
profile = "bench"
@@ -73,6 +113,11 @@ workflow = "signed_integer_multi_bit_benchmark.yml"
profile = "bench"
check_run_name = "Signed integer multi bit CPU AWS Benchmarks"
[command.integer_multi_bit_gpu_bench]
workflow = "integer_multi_bit_gpu_benchmark.yml"
profile = "gpu-bench"
check_run_name = "Integer multi bit GPU AWS Benchmarks"
[command.shortint_full_bench]
workflow = "shortint_full_benchmark.yml"
profile = "bench"

View File

@@ -60,6 +60,7 @@ rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.3.0", features = ["serde", "fft128"] }
pulp = "0.13"
tfhe-cuda-backend = {path = "../backends/tfhe-cuda-backend/rust_api", optional = true}
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
paste = "1.0.7"
@@ -83,6 +84,7 @@ boolean = []
shortint = []
integer = ["shortint"]
internal-keycache = ["dep:lazy_static", "dep:fs2"]
gpu = ["tfhe-cuda-backend"]
# Experimental section
experimental = []
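# Illustrative usage of the new feature (exact invocations may differ from the
# project's Make targets): `cargo build --release --features=gpu` pulls in the
# optional tfhe-cuda-backend dependency declared above, whose build script
# requires a Linux host with the CUDA toolchain installed.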

View File

@@ -21,6 +21,7 @@ use tfhe::integer::U256;
use tfhe::shortint::parameters::{
PARAM_MESSAGE_1_CARRY_1_KS_PBS, PARAM_MESSAGE_2_CARRY_2_KS_PBS, PARAM_MESSAGE_3_CARRY_3_KS_PBS,
PARAM_MESSAGE_4_CARRY_4_KS_PBS, PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
};
/// The type used to hold scalar values
@@ -57,6 +58,11 @@ impl Default for ParamsAndNumBlocksIter {
Err(_) => false,
};
let is_gpu = match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
Ok(val) => val.contains("gpu"),
Err(_) => false,
};
let bit_sizes = if is_fast_bench {
FAST_BENCH_BIT_SIZES.to_vec()
} else {
@@ -64,7 +70,18 @@ impl Default for ParamsAndNumBlocksIter {
};
if is_multi_bit {
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let params = if is_gpu {
vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()]
} else {
vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()]
};
let bit_sizes = if is_fast_bench {
vec![32]
} else {
BENCH_BIT_SIZES.to_vec()
};
let params_and_bit_sizes = iproduct!(params, bit_sizes);
Self {
params_and_bit_sizes,
@@ -77,6 +94,7 @@ impl Default for ParamsAndNumBlocksIter {
// PARAM_MESSAGE_3_CARRY_3_KS_PBS.into(),
// PARAM_MESSAGE_4_CARRY_4_KS_PBS.into(),
];
let params_and_bit_sizes = iproduct!(params, bit_sizes);
Self {
params_and_bit_sizes,
@@ -1136,6 +1154,709 @@ define_server_key_bench_default_fn!(
display_name: rotate_right
);
#[cfg(feature = "gpu")]
mod cuda {
use super::{default_scalar, shift_scalar, ParamsAndNumBlocksIter, ScalarType};
use crate::utilities::{write_to_json, OperatorType};
use criterion::{criterion_group, Criterion};
use rand::prelude::*;
use tfhe::core_crypto::gpu::{CudaDevice, CudaStream};
use tfhe::integer::gpu::ciphertext::CudaRadixCiphertext;
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
fn bench_cuda_server_key_unary_function_clean_inputs<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
unary_op: F,
) where
F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_two_values = || {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_radix(clear_0, num_block);
let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
d_ctxt_1
};
b.iter_batched(
encrypt_two_values,
|mut ct_0| {
unary_op(&gpu_sks, &mut ct_0, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
/// Base function to bench a server key function that is a binary operation, input ciphertext
/// will contain only zero carries
fn bench_cuda_server_key_binary_function_clean_inputs<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
) where
F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, &mut CudaRadixCiphertext, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_two_values = || {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_radix(clear_0, num_block);
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_1 = cks.encrypt_radix(clear_1, num_block);
let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
let d_ctxt_2 = CudaRadixCiphertext::from_radix_ciphertext(&ct_1, &stream);
(d_ctxt_1, d_ctxt_2)
};
b.iter_batched(
encrypt_two_values,
|(mut ct_0, mut ct_1)| {
binary_op(&gpu_sks, &mut ct_0, &mut ct_1, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
fn bench_cuda_server_key_binary_scalar_function_clean_inputs<F, G>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
rng_func: G,
) where
F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, ScalarType, &CudaStream),
G: Fn(&mut ThreadRng, usize) -> ScalarType,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
if bit_size > ScalarType::BITS as usize {
break;
}
let param_name = param.name();
let max_value_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size);
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_one_value = || {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_radix(clear_0, num_block);
let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size;
(d_ctxt_1, clear_1)
};
b.iter_batched(
encrypt_one_value,
|(mut ct_0, clear_1)| {
binary_op(&gpu_sks, &mut ct_0, clear_1, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
fn cuda_default_if_then_else(c: &mut Criterion) {
let mut bench_group = c.benchmark_group("integer::cuda::if_then_else");
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
if bit_size > ScalarType::BITS as usize {
break;
}
let param_name = param.name();
let bench_id = format!("if_then_else:{param_name}::{bit_size}_bits_scalar_{bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_tree_values = || {
let clear_cond = rng.gen::<bool>();
let ct_cond =
cks.encrypt_radix(tfhe::integer::U256::from(clear_cond), num_block);
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_then = cks.encrypt_radix(clear_0, num_block);
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_else = cks.encrypt_radix(clear_1, num_block);
let d_ct_cond = CudaRadixCiphertext::from_radix_ciphertext(&ct_cond, &stream);
let d_ct_then = CudaRadixCiphertext::from_radix_ciphertext(&ct_then, &stream);
let d_ct_else = CudaRadixCiphertext::from_radix_ciphertext(&ct_else, &stream);
(d_ct_cond, d_ct_then, d_ct_else)
};
b.iter_batched(
encrypt_tree_values,
|(ct_cond, ct_then, ct_else)| {
let _ = gpu_sks.if_then_else(&ct_cond, &ct_then, &ct_else, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
"if_then_else",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_unary_function_clean_inputs(
c,
concat!("integer::cuda::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, stream| {
server_key.$server_key_method(lhs, stream);
}
)
}
}
}
);
macro_rules! define_cuda_server_key_bench_clean_input_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_binary_function_clean_inputs(
c,
concat!("integer::cuda::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
}
)
}
}
}
);
macro_rules! define_cuda_server_key_bench_clean_input_scalar_fn (
(method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_binary_scalar_function_clean_inputs(
c,
concat!("integer::cuda::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
},
$($rng_fn)*
)
}
}
}
);
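// For instance, `define_cuda_server_key_bench_clean_input_fn!(method_name: add,
// display_name: add)` below expands (via paste) to a `fn cuda_add(c: &mut Criterion)`
// that forwards to bench_cuda_server_key_binary_function_clean_inputs with the
// bench name "integer::cuda::add" and the closure
// `|server_key, lhs, rhs, stream| server_key.add(lhs, rhs, stream)`.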
//===========================================
// Unchecked
//===========================================
define_cuda_server_key_bench_clean_input_unary_fn!(
method_name: unchecked_neg,
display_name: negation
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_bitand,
display_name: bitand
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_bitor,
display_name: bitor
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_bitxor,
display_name: bitxor
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_mul,
display_name: mul
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_add,
display_name: add
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_sub,
display_name: sub
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_eq,
display_name: equal
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_ne,
display_name: not_equal
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_bitand,
display_name: bitand,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_bitor,
    display_name: bitor,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_bitxor,
    display_name: bitxor,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_add,
display_name: add,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_sub,
display_name: sub,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_left_shift,
display_name: left_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_right_shift,
display_name: right_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_left_rotate,
display_name: left_rotate,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_right_rotate,
display_name: right_rotate,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_gt,
display_name: greater_than,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_ge,
display_name: greater_or_equal,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_lt,
display_name: less_than,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_le,
display_name: less_or_equal,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_max,
display_name: max,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: unchecked_scalar_min,
display_name: min,
rng_func: default_scalar
);
//===========================================
// Default
//===========================================
define_cuda_server_key_bench_clean_input_unary_fn!(
method_name: neg,
display_name: negation
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: add,
display_name: add
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: sub,
display_name: sub
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: mul,
display_name: mul
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: ne,
display_name: not_equal
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: eq,
display_name: equal
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: bitand,
display_name: bitand
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: bitor,
display_name: bitor
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: bitxor,
display_name: bitxor
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: gt,
display_name: greater_than
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: ge,
display_name: greater_or_equal
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: lt,
display_name: less_than
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: le,
display_name: less_or_equal
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: max,
display_name: max
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: min,
display_name: min
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_sub,
display_name: sub,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_add,
display_name: add,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_left_shift,
display_name: left_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_right_shift,
display_name: right_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_bitand,
display_name: bitand,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_bitor,
display_name: bitor,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_bitxor,
display_name: bitxor,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_gt,
display_name: greater_than,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_ge,
display_name: greater_or_equal,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_lt,
display_name: less_than,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_le,
display_name: less_or_equal,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_max,
display_name: max,
rng_func: default_scalar
);
define_cuda_server_key_bench_clean_input_scalar_fn!(
method_name: scalar_min,
display_name: min,
rng_func: default_scalar
);
criterion_group!(
unchecked_cuda_ops,
cuda_unchecked_neg,
cuda_unchecked_bitand,
cuda_unchecked_bitor,
cuda_unchecked_bitxor,
cuda_unchecked_mul,
cuda_unchecked_sub,
cuda_unchecked_add,
cuda_unchecked_eq,
cuda_unchecked_ne,
);
criterion_group!(
unchecked_scalar_cuda_ops,
cuda_unchecked_scalar_bitand,
cuda_unchecked_scalar_bitor,
cuda_unchecked_scalar_bitxor,
cuda_unchecked_scalar_add,
cuda_unchecked_scalar_sub,
cuda_unchecked_scalar_left_shift,
cuda_unchecked_scalar_right_shift,
cuda_unchecked_scalar_left_rotate,
cuda_unchecked_scalar_right_rotate,
cuda_unchecked_scalar_ge,
cuda_unchecked_scalar_gt,
cuda_unchecked_scalar_le,
cuda_unchecked_scalar_lt,
cuda_unchecked_scalar_max,
cuda_unchecked_scalar_min,
);
criterion_group!(
default_cuda_ops,
cuda_neg,
cuda_sub,
cuda_add,
cuda_mul,
cuda_eq,
cuda_ne,
cuda_ge,
cuda_gt,
cuda_le,
cuda_lt,
cuda_max,
cuda_min,
cuda_bitand,
cuda_bitor,
cuda_bitxor,
cuda_default_if_then_else,
);
criterion_group!(
default_scalar_cuda_ops,
cuda_scalar_sub,
cuda_scalar_add,
cuda_scalar_left_shift,
cuda_scalar_right_shift,
cuda_scalar_bitand,
cuda_scalar_bitor,
cuda_scalar_bitxor,
cuda_scalar_ge,
cuda_scalar_gt,
cuda_scalar_le,
cuda_scalar_lt,
cuda_scalar_max,
cuda_scalar_min,
);
}
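// Pull the GPU criterion groups defined above into the outer scope so main() can run them
// when the "gpu" feature is enabled.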
#[cfg(feature = "gpu")]
use cuda::{
default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops,
};
criterion_group!(
smart_ops,
smart_neg,
@@ -1371,35 +2092,56 @@ criterion_group!(
criterion_group!(misc, full_propagate, full_propagate_parallelized);
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"default" => {
default_cuda_ops();
default_scalar_cuda_ops()
}
"unchecked" => {
unchecked_cuda_ops();
unchecked_scalar_cuda_ops()
}
_ => panic!("unknown benchmark operations flavor"),
};
}
fn go_through_cpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"default" => {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp()
}
"smart" => {
smart_ops();
smart_ops_comp();
smart_scalar_ops();
smart_parallelized_ops();
smart_parallelized_ops_comp();
smart_scalar_parallelized_ops();
smart_scalar_parallelized_ops_comp()
}
"unchecked" => {
unchecked_ops();
unchecked_parallelized_ops();
unchecked_ops_comp();
unchecked_scalar_ops();
unchecked_scalar_ops_comp()
}
"misc" => misc(),
_ => panic!("unknown benchmark operations flavor"),
};
}
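// main() selects the benchmark flavor from the __TFHE_RS_BENCH_OP_FLAVOR environment variable
// (a typical invocation would look roughly like `__TFHE_RS_BENCH_OP_FLAVOR=unchecked cargo bench`),
// dispatching to the GPU groups when the "gpu" feature is enabled and to the CPU groups otherwise.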
fn main() {
match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
Ok(val) => {
match val.to_lowercase().as_str() {
"default" => {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp()
}
"smart" => {
smart_ops();
smart_ops_comp();
smart_scalar_ops();
smart_parallelized_ops();
smart_parallelized_ops_comp();
smart_scalar_parallelized_ops();
smart_scalar_parallelized_ops_comp()
}
"unchecked" => {
unchecked_ops();
unchecked_parallelized_ops();
unchecked_ops_comp();
unchecked_scalar_ops();
unchecked_scalar_ops_comp()
}
"misc" => misc(),
_ => panic!("unknown benchmark operations flavor"),
};
#[cfg(feature = "gpu")]
go_through_gpu_bench_groups(&val);
#[cfg(not(feature = "gpu"))]
go_through_cpu_bench_groups(&val);
}
Err(_) => {
default_parallelized_ops();

View File

@@ -1,6 +1,194 @@
use super::utils::*;
use std::os::raw::c_int;
#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_secret_key(
output_lwe_sk_ptr: *mut u64,
lwe_sk_dim: usize,
seed_low_bytes: u64,
seed_high_bytes: u64,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::commons::math::random::Seed;
use crate::core_crypto::prelude::*;
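// Recombine the two u64 halves passed over the C FFI into a single 128-bit seed.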
let seed_low_bytes: u128 = seed_low_bytes.into();
let seed_high_bytes: u128 = seed_high_bytes.into();
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let mut secret_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(Seed(seed));
// Create the LweSecretKey
let output_lwe_sk_slice = std::slice::from_raw_parts_mut(output_lwe_sk_ptr, lwe_sk_dim);
let mut lwe_sk = LweSecretKey::from_container(output_lwe_sk_slice);
generate_binary_lwe_secret_key(&mut lwe_sk, &mut secret_generator);
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_encrypt(
output_ct_ptr: *mut u64,
pt: u64,
lwe_sk_ptr: *const u64,
lwe_sk_dim: usize,
lwe_encryption_std_dev: f64,
seed_low_bytes: u64,
seed_high_bytes: u64,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::commons::math::random::Seed;
use crate::core_crypto::prelude::*;
let lwe_sk_slice = std::slice::from_raw_parts(lwe_sk_ptr, lwe_sk_dim);
let lwe_sk = LweSecretKey::from_container(lwe_sk_slice);
let seed_low_bytes: u128 = seed_low_bytes.into();
let seed_high_bytes: u128 = seed_high_bytes.into();
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let seed = Seed(seed);
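// Using a DeterministicSeeder makes the encryption noise reproducible for a given seed.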
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let plaintext = Plaintext(pt);
let output_ct = std::slice::from_raw_parts_mut(output_ct_ptr, lwe_sk_dim + 1);
let mut ct = LweCiphertext::from_container(output_ct, CiphertextModulus::new_native());
let lwe_encryption_std_dev = StandardDev(lwe_encryption_std_dev);
encrypt_lwe_ciphertext(
&lwe_sk,
&mut ct,
plaintext,
lwe_encryption_std_dev,
&mut encryption_generator,
);
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_ggsw_encrypt(
output_ct_ptr: *mut u64,
pt: u64,
glwe_sk_ptr: *const u64,
glwe_sk_dim: usize,
poly_size: usize,
level_count: usize,
base_log: usize,
glwe_modular_variance: f64,
seed_low_bytes: u64,
seed_high_bytes: u64,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::commons::math::random::Seed;
use crate::core_crypto::prelude::*;
let glwe_sk_slice = std::slice::from_raw_parts(glwe_sk_ptr, glwe_sk_dim * poly_size);
let glwe_sk = GlweSecretKey::from_container(glwe_sk_slice, PolynomialSize(poly_size));
let seed_low_bytes: u128 = seed_low_bytes.into();
let seed_high_bytes: u128 = seed_high_bytes.into();
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let seed = Seed(seed);
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let plaintext = Plaintext(pt);
let output_ct = std::slice::from_raw_parts_mut(
output_ct_ptr,
ggsw_ciphertext_size(
GlweDimension(glwe_sk_dim).to_glwe_size(),
PolynomialSize(poly_size),
DecompositionLevelCount(level_count),
),
);
let mut ct = GgswCiphertext::from_container(
output_ct,
GlweDimension(glwe_sk_dim).to_glwe_size(),
PolynomialSize(poly_size),
DecompositionBaseLog(base_log),
CiphertextModulus::new_native(),
);
let glwe_encryption_std_dev = StandardDev(glwe_modular_variance);
encrypt_constant_ggsw_ciphertext(
&glwe_sk,
&mut ct,
plaintext,
glwe_encryption_std_dev,
&mut encryption_generator,
);
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_decrypt(
output_pt: *mut u64,
input_ct_ptr: *const u64,
lwe_sk_ptr: *const u64,
lwe_sk_dim: usize,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::prelude::*;
let lwe_sk_slice = std::slice::from_raw_parts(lwe_sk_ptr, lwe_sk_dim);
let lwe_sk = LweSecretKey::from_container(lwe_sk_slice);
let input_ct = std::slice::from_raw_parts(input_ct_ptr, lwe_sk_dim + 1);
let ct = LweCiphertext::from_container(input_ct, CiphertextModulus::new_native());
let plaintext = decrypt_lwe_ciphertext(&lwe_sk, &ct);
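// Write back the raw decrypted plaintext (still scaled and noisy); rounding/decoding is left to the caller.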
*output_pt = plaintext.0;
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_glwe_decrypt(
output_pt: *mut u64,
input_ct_ptr: *const u64,
glwe_sk_ptr: *const u64,
glwe_sk_dim: usize,
glwe_poly_size: usize,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::prelude::*;
let glwe_sk_slice = std::slice::from_raw_parts(glwe_sk_ptr, glwe_sk_dim * glwe_poly_size);
let glwe_sk = GlweSecretKey::from_container(glwe_sk_slice, PolynomialSize(glwe_poly_size));
let input_ct = std::slice::from_raw_parts(
input_ct_ptr,
glwe_ciphertext_size(
GlweDimension(glwe_sk_dim).to_glwe_size(),
PolynomialSize(glwe_poly_size),
),
);
let ct = GlweCiphertext::from_container(
input_ct,
PolynomialSize(glwe_poly_size),
CiphertextModulus::new_native(),
);
let output = std::slice::from_raw_parts_mut(output_pt, glwe_poly_size);
let mut plaintext_list = PlaintextList::from_container(output);
decrypt_glwe_ciphertext(&glwe_sk, &ct, &mut plaintext_list);
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_lwe_multi_bit_bootstrapping_key_element_size(
input_lwe_sk_dim: usize,
@@ -34,6 +222,88 @@ pub unsafe extern "C" fn core_crypto_lwe_multi_bit_bootstrapping_key_element_siz
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_bootstrapping_key(
output_bsk_ptr: *mut u64,
bsk_base_log: usize,
bsk_level_count: usize,
input_lwe_sk_ptr: *const u64,
input_lwe_sk_dim: usize,
output_glwe_sk_ptr: *const u64,
output_glwe_sk_dim: usize,
output_glwe_sk_poly_size: usize,
glwe_encryption_std_dev: f64,
seed_low_bytes: u64,
seed_high_bytes: u64,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::commons::math::random::Seed;
use crate::core_crypto::prelude::*;
let input_lwe_sk_slice = std::slice::from_raw_parts(input_lwe_sk_ptr, input_lwe_sk_dim);
let input_lwe_sk = LweSecretKey::from_container(input_lwe_sk_slice);
let output_glwe_sk_dim = GlweDimension(output_glwe_sk_dim);
let output_glwe_sk_poly_size = PolynomialSize(output_glwe_sk_poly_size);
let output_glwe_sk_size =
glwe_ciphertext_mask_size(output_glwe_sk_dim, output_glwe_sk_poly_size);
let output_glwe_sk_slice =
std::slice::from_raw_parts(output_glwe_sk_ptr, output_glwe_sk_size);
let output_glwe_sk =
GlweSecretKey::from_container(output_glwe_sk_slice, output_glwe_sk_poly_size);
let seed_low_bytes: u128 = seed_low_bytes.into();
let seed_high_bytes: u128 = seed_high_bytes.into();
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(Seed(seed));
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let lwe_base_log = DecompositionBaseLog(bsk_base_log);
let lwe_level_count = DecompositionLevelCount(bsk_level_count);
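// Allocate a temporary owned bootstrap key only to learn the expected container length of the
// caller-provided output buffer.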
let lwe_slice_len = {
let bsk = LweBootstrapKeyOwned::new(
0u64,
output_glwe_sk.glwe_dimension().to_glwe_size(),
output_glwe_sk.polynomial_size(),
lwe_base_log,
lwe_level_count,
input_lwe_sk.lwe_dimension(),
CiphertextModulus::new_native(),
);
bsk.into_container().len()
};
let bsk_slice = std::slice::from_raw_parts_mut(output_bsk_ptr, lwe_slice_len);
let mut bsk = LweBootstrapKey::from_container(
bsk_slice,
output_glwe_sk.glwe_dimension().to_glwe_size(),
output_glwe_sk.polynomial_size(),
lwe_base_log,
lwe_level_count,
CiphertextModulus::new_native(),
);
let glwe_encryption_std_dev = StandardDev(glwe_encryption_std_dev);
par_generate_lwe_bootstrap_key(
&input_lwe_sk,
&output_glwe_sk,
&mut bsk,
glwe_encryption_std_dev,
&mut encryption_random_generator,
)
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
input_lwe_sk_ptr: *const u64,
@@ -120,3 +390,151 @@ pub unsafe extern "C" fn core_crypto_par_generate_lwe_multi_bit_bootstrapping_ke
);
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_keyswitch_key(
output_ksk_ptr: *mut u64,
ksk_base_log: usize,
ksk_level_count: usize,
input_lwe_sk_ptr: *const u64,
input_lwe_sk_dim: usize,
output_lwe_sk_ptr: *const u64,
output_lwe_sk_dim: usize,
lwe_encryption_std_dev: f64,
seed_low_bytes: u64,
seed_high_bytes: u64,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::commons::math::random::Seed;
use crate::core_crypto::prelude::*;
let input_lwe_sk_slice = std::slice::from_raw_parts(input_lwe_sk_ptr, input_lwe_sk_dim);
let input_lwe_sk = LweSecretKey::from_container(input_lwe_sk_slice);
let output_lwe_sk_slice = std::slice::from_raw_parts(output_lwe_sk_ptr, output_lwe_sk_dim);
let output_lwe_sk = LweSecretKey::from_container(output_lwe_sk_slice);
let seed_low_bytes: u128 = seed_low_bytes.into();
let seed_high_bytes: u128 = seed_high_bytes.into();
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(Seed(seed));
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let lwe_base_log = DecompositionBaseLog(ksk_base_log);
let lwe_level_count = DecompositionLevelCount(ksk_level_count);
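// Allocate a temporary owned keyswitch key only to learn the expected container length of the
// caller-provided output buffer.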
let lwe_slice_len = {
let bsk = LweKeyswitchKeyOwned::new(
0u64,
lwe_base_log,
lwe_level_count,
LweDimension(input_lwe_sk_dim),
LweDimension(output_lwe_sk_dim),
CiphertextModulus::new_native(),
);
bsk.into_container().len()
};
let ksk_slice = std::slice::from_raw_parts_mut(output_ksk_ptr, lwe_slice_len);
let mut ksk = LweKeyswitchKey::from_container(
ksk_slice,
lwe_base_log,
lwe_level_count,
LweDimension(output_lwe_sk_dim).to_lwe_size(),
CiphertextModulus::new_native(),
);
let lwe_encryption_std_dev = StandardDev(lwe_encryption_std_dev);
generate_lwe_keyswitch_key(
&input_lwe_sk,
&output_lwe_sk,
&mut ksk,
lwe_encryption_std_dev,
&mut encryption_random_generator,
)
})
}
#[no_mangle]
pub unsafe extern "C" fn core_crypto_par_generate_lwe_private_functional_keyswitch_key(
output_pksk_ptr: *mut u64,
pksk_base_log: usize,
pksk_level_count: usize,
input_lwe_sk_ptr: *const u64,
input_lwe_sk_dim: usize,
output_glwe_sk_ptr: *const u64,
poly_size: usize,
glwe_dim: usize,
lwe_encryption_std_dev: f64,
seed_low_bytes: u64,
seed_high_bytes: u64,
) -> c_int {
catch_panic(|| {
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::commons::math::random::Seed;
use crate::core_crypto::prelude::*;
let input_lwe_sk_slice = std::slice::from_raw_parts(input_lwe_sk_ptr, input_lwe_sk_dim);
let input_lwe_sk = LweSecretKey::from_container(input_lwe_sk_slice);
let output_glwe_sk_slice =
std::slice::from_raw_parts(output_glwe_sk_ptr, glwe_dim * poly_size);
let output_glwe_sk =
GlweSecretKey::from_container(output_glwe_sk_slice, PolynomialSize(poly_size));
let seed_low_bytes: u128 = seed_low_bytes.into();
let seed_high_bytes: u128 = seed_high_bytes.into();
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(Seed(seed));
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
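// The circuit-bootstrapping PFPKSK list holds glwe_dim + 1 keys: one per GLWE mask polynomial
// plus one for the body.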
let pksk_len = {
let ksk = LwePrivateFunctionalPackingKeyswitchKeyList::new(
0u64,
DecompositionBaseLog(pksk_base_log),
DecompositionLevelCount(pksk_level_count),
LweDimension(input_lwe_sk_dim),
GlweDimension(glwe_dim).to_glwe_size(),
PolynomialSize(poly_size),
FunctionalPackingKeyswitchKeyCount(glwe_dim + 1),
CiphertextModulus::new_native(),
);
ksk.into_container().len()
};
let ksk_slice = std::slice::from_raw_parts_mut(output_pksk_ptr, pksk_len);
let mut fp_ksk = LwePrivateFunctionalPackingKeyswitchKeyList::from_container(
ksk_slice,
DecompositionBaseLog(pksk_base_log),
DecompositionLevelCount(pksk_level_count),
LweDimension(input_lwe_sk_dim).to_lwe_size(),
GlweDimension(glwe_dim).to_glwe_size(),
PolynomialSize(poly_size),
CiphertextModulus::new_native(),
);
let lwe_encryption_std_dev = StandardDev(lwe_encryption_std_dev);
generate_circuit_bootstrap_lwe_pfpksk_list(
&mut fp_ksk,
&input_lwe_sk,
&output_glwe_sk,
lwe_encryption_std_dev,
&mut encryption_random_generator,
)
})
}

View File

@@ -332,7 +332,7 @@ pub fn round_decode<Scalar: UnsignedInteger>(decrypted: Scalar, delta: Scalar) -
}
// Here we will define a helper function to generate an accumulator for a PBS
fn generate_accumulator<F, Scalar: UnsignedTorus + CastFrom<usize>>(
pub(crate) fn generate_accumulator<F, Scalar: UnsignedTorus + CastFrom<usize>>(
polynomial_size: PolynomialSize,
glwe_size: GlweSize,
message_modulus: usize,

View File

@@ -23,6 +23,11 @@ pub struct CiphertextCount(pub usize);
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
pub struct LweCiphertextCount(pub usize);
/// The index of a ciphertext in an lwe ciphertext list.
#[cfg(feature = "gpu")]
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
pub struct LweCiphertextIndex(pub usize);
/// The number of ciphertexts in a glwe ciphertext list.
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
pub struct GlweCiphertextCount(pub usize);

View File

@@ -0,0 +1,73 @@
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::UnsignedInteger;
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_keyswitch_lwe_ciphertext_async<Scalar>(
lwe_keyswitch_key: &CudaLweKeyswitchKey<Scalar>,
input_lwe_ciphertext: &CudaLweCiphertextList<Scalar>,
output_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
input_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
assert!(
lwe_keyswitch_key.input_key_lwe_size().to_lwe_dimension()
== input_lwe_ciphertext.lwe_dimension(),
"Mismatched input LweDimension. \
LweKeyswitchKey input LweDimension: {:?}, input LweCiphertext LweDimension {:?}.",
lwe_keyswitch_key.input_key_lwe_size().to_lwe_dimension(),
input_lwe_ciphertext.lwe_dimension(),
);
assert!(
lwe_keyswitch_key.output_key_lwe_size().to_lwe_dimension()
== output_lwe_ciphertext.lwe_dimension(),
"Mismatched output LweDimension. \
LweKeyswitchKey output LweDimension: {:?}, output LweCiphertext LweDimension {:?}.",
lwe_keyswitch_key.output_key_lwe_size().to_lwe_dimension(),
output_lwe_ciphertext.lwe_dimension(),
);
stream.keyswitch_async(
&mut output_lwe_ciphertext.0.d_vec,
output_indexes,
&input_lwe_ciphertext.0.d_vec,
input_indexes,
lwe_keyswitch_key.input_key_lwe_size().to_lwe_dimension(),
lwe_keyswitch_key.output_key_lwe_size().to_lwe_dimension(),
&lwe_keyswitch_key.d_vec,
lwe_keyswitch_key.decomposition_base_log(),
lwe_keyswitch_key.decomposition_level_count(),
input_lwe_ciphertext.lwe_ciphertext_count().0 as u32,
);
}
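// Safe wrapper around the *_async variant: it launches the keyswitch and synchronizes the stream
// before returning, so callers never observe in-flight GPU work.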
pub fn cuda_keyswitch_lwe_ciphertext<Scalar>(
lwe_keyswitch_key: &CudaLweKeyswitchKey<Scalar>,
input_lwe_ciphertext: &CudaLweCiphertextList<Scalar>,
output_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
input_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_keyswitch_lwe_ciphertext_async(
lwe_keyswitch_key,
input_lwe_ciphertext,
output_lwe_ciphertext,
input_indexes,
output_indexes,
stream,
);
}
stream.synchronize();
}

View File

@@ -0,0 +1,361 @@
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::UnsignedInteger;
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_add_async<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
lhs: &CudaLweCiphertextList<Scalar>,
rhs: &CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
let num_samples = output.lwe_ciphertext_count().0 as u32;
assert_eq!(
lhs.lwe_ciphertext_count(),
rhs.lwe_ciphertext_count(),
"Mismatched number of ciphertexts between lhs ({:?}) and rhs ({:?})",
lhs.lwe_ciphertext_count(),
rhs.lwe_ciphertext_count()
);
assert_eq!(
output.lwe_ciphertext_count(),
rhs.lwe_ciphertext_count(),
"Mismatched number of ciphertexts between output ({:?}) and rhs ({:?})",
output.lwe_ciphertext_count(),
rhs.lwe_ciphertext_count()
);
assert_eq!(
lhs.ciphertext_modulus(),
rhs.ciphertext_modulus(),
"Mismatched moduli between lhs ({:?}) and rhs ({:?}) LweCiphertext",
lhs.ciphertext_modulus(),
rhs.ciphertext_modulus()
);
assert_eq!(
output.ciphertext_modulus(),
rhs.ciphertext_modulus(),
"Mismatched moduli between output ({:?}) and rhs ({:?}) LweCiphertext",
output.ciphertext_modulus(),
rhs.ciphertext_modulus()
);
stream.add_lwe_ciphertext_vector_async(
&mut output.0.d_vec,
&lhs.0.d_vec,
&rhs.0.d_vec,
lhs.lwe_dimension(),
num_samples,
);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_add_assign_async<Scalar>(
lhs: &mut CudaLweCiphertextList<Scalar>,
rhs: &CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
let num_samples = lhs.lwe_ciphertext_count().0 as u32;
assert_eq!(
lhs.lwe_ciphertext_count(),
rhs.lwe_ciphertext_count(),
"Mismatched number of ciphertexts between lhs ({:?}) and rhs ({:?})",
lhs.lwe_ciphertext_count(),
rhs.lwe_ciphertext_count()
);
assert_eq!(
lhs.ciphertext_modulus(),
rhs.ciphertext_modulus(),
"Mismatched moduli between lhs ({:?}) and rhs ({:?}) LweCiphertext",
lhs.ciphertext_modulus(),
rhs.ciphertext_modulus()
);
stream.add_lwe_ciphertext_vector_assign_async(
&mut lhs.0.d_vec,
&rhs.0.d_vec,
rhs.lwe_dimension(),
num_samples,
);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_plaintext_add_async<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
lhs: &CudaLweCiphertextList<Scalar>,
rhs: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
let num_samples = output.lwe_ciphertext_count().0 as u32;
assert_eq!(
output.lwe_ciphertext_count(),
lhs.lwe_ciphertext_count(),
"Mismatched number of ciphertexts between output ({:?}) and lhs ({:?})",
output.lwe_ciphertext_count(),
lhs.lwe_ciphertext_count()
);
assert_eq!(
output.ciphertext_modulus(),
lhs.ciphertext_modulus(),
"Mismatched moduli between output ({:?}) and lhs ({:?}) LweCiphertext",
output.ciphertext_modulus(),
lhs.ciphertext_modulus()
);
stream.add_lwe_ciphertext_vector_plaintext_vector_async(
&mut output.0.d_vec,
&lhs.0.d_vec,
rhs,
lhs.lwe_dimension(),
num_samples,
);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_plaintext_add_assign_async<Scalar>(
lhs: &mut CudaLweCiphertextList<Scalar>,
rhs: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
let num_samples = lhs.lwe_ciphertext_count().0 as u32;
let lwe_dimension = &lhs.lwe_dimension();
stream.add_lwe_ciphertext_vector_plaintext_vector_assign_async(
&mut lhs.0.d_vec,
rhs,
*lwe_dimension,
num_samples,
);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_negate_async<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
assert_eq!(
input.lwe_ciphertext_count(),
output.lwe_ciphertext_count(),
"Mismatched number of ciphertexts between input ({:?}) and output ({:?})",
input.lwe_ciphertext_count(),
output.lwe_ciphertext_count()
);
let num_samples = output.lwe_ciphertext_count().0 as u32;
let lwe_dimension = &output.lwe_dimension();
stream.negate_lwe_ciphertext_vector_async(
&mut output.0.d_vec,
&input.0.d_vec,
*lwe_dimension,
num_samples,
);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_negate_assign_async<Scalar>(
ct: &mut CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
let num_samples = ct.lwe_ciphertext_count().0 as u32;
let lwe_dimension = &ct.lwe_dimension();
stream.negate_lwe_ciphertext_vector_assign_async(&mut ct.0.d_vec, *lwe_dimension, num_samples);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_cleartext_mul_async<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<Scalar>,
cleartext: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
assert_eq!(
input.lwe_ciphertext_count(),
output.lwe_ciphertext_count(),
"Mismatched number of ciphertexts between input ({:?}) and output ({:?})",
input.lwe_ciphertext_count(),
output.lwe_ciphertext_count()
);
let num_samples = output.lwe_ciphertext_count().0 as u32;
let lwe_dimension = &output.lwe_dimension();
stream.mult_lwe_ciphertext_vector_cleartext_vector(
&mut output.0.d_vec,
&input.0.d_vec,
cleartext,
*lwe_dimension,
num_samples,
);
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_lwe_ciphertext_cleartext_mul_assign_async<Scalar>(
ct: &mut CudaLweCiphertextList<Scalar>,
cleartext: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
let num_samples = ct.lwe_ciphertext_count().0 as u32;
let lwe_dimension = ct.lwe_dimension();
stream.mult_lwe_ciphertext_vector_cleartext_vector_assign_async(
&mut ct.0.d_vec,
cleartext,
lwe_dimension,
num_samples,
);
}
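// The safe wrappers below launch the corresponding *_async primitive and synchronize the stream
// before returning. A minimal usage sketch, mirroring the pattern used in the GPU tests (the
// host-side ciphertexts `lhs_ct`/`rhs_ct` are assumed to exist):
//     let device = CudaDevice::new(0);
//     let stream = CudaStream::new_unchecked(device);
//     let d_lhs = CudaLweCiphertextList::from_lwe_ciphertext(&lhs_ct, &stream);
//     let d_rhs = CudaLweCiphertextList::from_lwe_ciphertext(&rhs_ct, &stream);
//     let mut d_out = CudaLweCiphertextList::from_lwe_ciphertext(&lhs_ct, &stream);
//     cuda_lwe_ciphertext_add(&mut d_out, &d_lhs, &d_rhs, &stream); // synchronizes internally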
pub fn cuda_lwe_ciphertext_add<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
lhs: &CudaLweCiphertextList<Scalar>,
rhs: &CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_add_async(output, lhs, rhs, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_add_assign<Scalar>(
lhs: &mut CudaLweCiphertextList<Scalar>,
rhs: &CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_add_assign_async(lhs, rhs, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_plaintext_add<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
lhs: &CudaLweCiphertextList<Scalar>,
rhs: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_plaintext_add_async(output, lhs, rhs, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_plaintext_add_assign<Scalar>(
lhs: &mut CudaLweCiphertextList<Scalar>,
rhs: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_plaintext_add_assign_async(lhs, rhs, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_negate<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_negate_async(output, input, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_negate_assign<Scalar>(
ct: &mut CudaLweCiphertextList<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_negate_assign_async(ct, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_cleartext_mul<Scalar>(
output: &mut CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<Scalar>,
cleartext: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_cleartext_mul_async(output, input, cleartext, stream);
}
stream.synchronize();
}
pub fn cuda_lwe_ciphertext_cleartext_mul_assign<Scalar>(
ct: &mut CudaLweCiphertextList<Scalar>,
cleartext: &CudaVec<Scalar>,
stream: &CudaStream,
) where
Scalar: UnsignedInteger,
{
unsafe {
cuda_lwe_ciphertext_cleartext_mul_assign_async(ct, cleartext, stream);
}
stream.synchronize();
}

View File

@@ -0,0 +1,123 @@
use crate::core_crypto::gpu::entities::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::entities::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::{CastInto, LweCiphertextIndex, UnsignedTorus};
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
stream: &CudaStream,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
assert_eq!(
input.lwe_dimension(),
multi_bit_bsk.input_lwe_dimension(),
"Mimatched input LweDimension. LweCiphertext input LweDimension {:?}. \
FourierLweMultiBitBootstrapKey input LweDimension {:?}.",
input.lwe_dimension(),
multi_bit_bsk.input_lwe_dimension(),
);
assert_eq!(
output.lwe_dimension(),
multi_bit_bsk.output_lwe_dimension(),
"Mimatched output LweDimension. LweCiphertext output LweDimension {:?}. \
FourierLweMultiBitBootstrapKey output LweDimension {:?}.",
output.lwe_dimension(),
multi_bit_bsk.output_lwe_dimension(),
);
assert_eq!(
accumulator.glwe_dimension(),
multi_bit_bsk.glwe_dimension(),
"Mimatched GlweSize. Accumulator GlweSize {:?}. \
FourierLweMultiBitBootstrapKey GlweSize {:?}.",
accumulator.glwe_dimension(),
multi_bit_bsk.glwe_dimension(),
);
assert_eq!(
accumulator.polynomial_size(),
multi_bit_bsk.polynomial_size(),
"Mimatched PolynomialSize. Accumulator PolynomialSize {:?}. \
FourierLweMultiBitBootstrapKey PolynomialSize {:?}.",
accumulator.polynomial_size(),
multi_bit_bsk.polynomial_size(),
);
assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
);
assert_eq!(
input.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
input.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
);
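// Launch the multi-bit PBS kernel; per the safety contract above, the caller must synchronize
// the stream before reading `output` or dropping the inputs.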
stream.bootstrap_multi_bit_async(
&mut output.0.d_vec,
output_indexes,
&accumulator.0.d_vec,
lut_indexes,
&input.0.d_vec,
input_indexes,
&multi_bit_bsk.d_vec,
input.lwe_dimension(),
multi_bit_bsk.glwe_dimension(),
multi_bit_bsk.polynomial_size(),
multi_bit_bsk.decomp_base_log(),
multi_bit_bsk.decomp_level_count(),
multi_bit_bsk.grouping_factor(),
input.lwe_ciphertext_count().0 as u32,
LweCiphertextIndex(0),
);
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
stream: &CudaStream,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
unsafe {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async(
input,
output,
accumulator,
lut_indexes,
output_indexes,
input_indexes,
multi_bit_bsk,
stream,
);
}
stream.synchronize();
}

View File

@@ -0,0 +1,81 @@
use crate::core_crypto::gpu::entities::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::entities::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStream;
use crate::core_crypto::prelude::{
CastInto, LweCiphertextCount, LweCiphertextIndex, UnsignedTorus,
};
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
num_samples: LweCiphertextCount,
bsk: &CudaLweBootstrapKey,
stream: &CudaStream,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
assert_eq!(input.ciphertext_modulus(), output.ciphertext_modulus());
assert_eq!(
output.ciphertext_modulus(),
accumulator.ciphertext_modulus()
);
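// Launch the low-latency classic PBS kernel; as with the other *_async entry points, the
// caller must synchronize the stream before using `output`.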
stream.bootstrap_low_latency_async(
&mut output.0.d_vec,
output_indexes,
&accumulator.0.d_vec,
lut_indexes,
&input.0.d_vec,
input_indexes,
&bsk.d_vec,
input.lwe_dimension(),
bsk.glwe_dimension(),
bsk.polynomial_size(),
bsk.decomp_base_log(),
bsk.decomp_level_count(),
num_samples.0 as u32,
LweCiphertextIndex(0),
);
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
num_samples: LweCiphertextCount,
bsk: &CudaLweBootstrapKey,
stream: &CudaStream,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
unsafe {
cuda_programmable_bootstrap_lwe_ciphertext_async(
input,
output,
accumulator,
lut_indexes,
output_indexes,
input_indexes,
num_samples,
bsk,
stream,
);
}
stream.synchronize();
}

View File

@@ -0,0 +1,12 @@
pub mod lwe_linear_algebra;
pub mod lwe_multi_bit_programmable_bootstrapping;
pub mod lwe_programmable_bootstrapping;
mod lwe_keyswitch;
#[cfg(test)]
mod test;
pub use lwe_keyswitch::*;
pub use lwe_linear_algebra::*;
pub use lwe_multi_bit_programmable_bootstrapping::*;
pub use lwe_programmable_bootstrapping::*;

View File

@@ -0,0 +1,123 @@
use super::*;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use crate::core_crypto::gpu::{cuda_keyswitch_lwe_ciphertext, CudaDevice, CudaStream};
use itertools::Itertools;
fn lwe_encrypt_ks_decrypt_custom_mod<Scalar: UnsignedTorus + CastFrom<usize>>(
params: ClassicTestParams<Scalar>,
) {
let lwe_dimension = params.lwe_dimension;
let lwe_modular_std_dev = params.lwe_modular_std_dev;
let ciphertext_modulus = params.ciphertext_modulus;
let message_modulus_log = params.message_modulus_log;
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ks_decomp_base_log = params.ks_base_log;
let ks_decomp_level_count = params.ks_level;
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
let mut rsc = TestResources::new();
const NB_TESTS: usize = 10;
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
let mut msg = msg_modulus;
let delta: Scalar = encoding_with_padding / msg_modulus;
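// Sweep every message in the cleartext space; each value is encrypted, keyswitched on the GPU
// and decrypted NB_TESTS times.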
while msg != Scalar::ZERO {
msg = msg.wrapping_sub(Scalar::ONE);
for _ in 0..NB_TESTS {
let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
lwe_dimension,
&mut rsc.secret_random_generator,
);
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
glwe_dimension,
polynomial_size,
&mut rsc.secret_random_generator,
);
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&big_lwe_sk,
&lwe_sk,
ks_decomp_base_log,
ks_decomp_level_count,
lwe_modular_std_dev,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&ksk_big_to_small,
ciphertext_modulus
));
let d_ksk_big_to_small =
CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
let plaintext = Plaintext(msg * delta);
let ct = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk,
plaintext,
lwe_modular_std_dev,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&ct,
ciphertext_modulus
));
let d_ct = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &stream);
let mut d_output_ct = CudaLweCiphertextList::new(
ksk_big_to_small.output_key_lwe_dimension(),
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let num_blocks = d_ct.0.lwe_ciphertext_count.0;
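// Build identity index vectors (block i maps to block i) and upload them to the GPU.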
let lwe_indexes_usize = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_input_indexes = stream.malloc_async::<Scalar>(num_blocks as u32);
let mut d_output_indexes = stream.malloc_async::<Scalar>(num_blocks as u32);
stream.copy_to_gpu_async(&mut d_input_indexes, &lwe_indexes);
stream.copy_to_gpu_async(&mut d_output_indexes, &lwe_indexes);
cuda_keyswitch_lwe_ciphertext(
&d_ksk_big_to_small,
&d_ct,
&mut d_output_ct,
&d_input_indexes,
&d_output_indexes,
&stream,
);
let output_ct = d_output_ct.into_lwe_ciphertext(&stream);
assert!(check_encrypted_content_respects_mod(
&output_ct,
ciphertext_modulus
));
let decrypted = decrypt_lwe_ciphertext(&lwe_sk, &output_ct);
let decoded = round_decode(decrypted.0, delta) % msg_modulus;
assert_eq!(msg, decoded);
}
}
}
create_gpu_parametrized_test!(lwe_encrypt_ks_decrypt_custom_mod);

View File

@@ -0,0 +1,78 @@
use super::*;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::{cuda_lwe_ciphertext_add_assign, CudaDevice, CudaStream};
fn lwe_encrypt_add_assign_decrypt_custom_mod<Scalar: UnsignedTorus>(
params: ClassicTestParams<Scalar>,
) {
let lwe_dimension = params.lwe_dimension;
let lwe_modular_std_dev = params.lwe_modular_std_dev;
let ciphertext_modulus = params.ciphertext_modulus;
let message_modulus_log = params.message_modulus_log;
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
let mut rsc = TestResources::new();
const NB_TESTS: usize = 10;
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
let mut msg = msg_modulus;
let delta: Scalar = encoding_with_padding / msg_modulus;
while msg != Scalar::ZERO {
msg = msg.wrapping_sub(Scalar::ONE);
for _ in 0..NB_TESTS {
let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
lwe_dimension,
&mut rsc.secret_random_generator,
);
let mut ct = LweCiphertext::new(
Scalar::ZERO,
lwe_dimension.to_lwe_size(),
ciphertext_modulus,
);
let plaintext = Plaintext(msg * delta);
encrypt_lwe_ciphertext(
&lwe_sk,
&mut ct,
plaintext,
lwe_modular_std_dev,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&ct,
ciphertext_modulus
));
let rhs = ct.clone();
// Convert to CUDA objects
let mut d_ct = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &stream);
let d_rhs = CudaLweCiphertextList::from_lwe_ciphertext(&rhs, &stream);
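// Adding the ciphertext to itself should homomorphically double the message (mod msg_modulus).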
cuda_lwe_ciphertext_add_assign(&mut d_ct, &d_rhs, &stream);
let output = d_ct.into_lwe_ciphertext(&stream);
assert!(check_encrypted_content_respects_mod(
&output,
ciphertext_modulus
));
let decrypted = decrypt_lwe_ciphertext(&lwe_sk, &output);
let decoded = round_decode(decrypted.0, delta) % msg_modulus;
assert_eq!((msg + msg) % msg_modulus, decoded);
}
}
}
create_gpu_parametrized_test!(lwe_encrypt_add_assign_decrypt_custom_mod);

Some files were not shown because too many files have changed in this diff.