mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 15:48:20 -05:00
Compare commits
54 Commits
gpu/hypers
...
dt/ci/cove
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
146aaf662a | ||
|
|
c28c8beb69 | ||
|
|
08fd31f8fc | ||
|
|
ba18519946 | ||
|
|
ab8a51bc0d | ||
|
|
9ecb9f416e | ||
|
|
0618b9422c | ||
|
|
5bb3af30b6 | ||
|
|
ef1607db51 | ||
|
|
d2daf449b7 | ||
|
|
826c80ea6d | ||
|
|
bc991e1946 | ||
|
|
f88e309503 | ||
|
|
cbc23c08a2 | ||
|
|
e9ee813911 | ||
|
|
3ebc114c85 | ||
|
|
effe6a9ec8 | ||
|
|
a8fe4d9f38 | ||
|
|
af7c1e14a4 | ||
|
|
85fbca62a2 | ||
|
|
9c36ee477b | ||
|
|
9d70e695b5 | ||
|
|
33e7cfea2d | ||
|
|
ce0aea171b | ||
|
|
083c068144 | ||
|
|
8912e887e3 | ||
|
|
ddd0f2ce79 | ||
|
|
aeab9e851b | ||
|
|
919f07f911 | ||
|
|
6199e04ce0 | ||
|
|
77a6673d59 | ||
|
|
631dd09445 | ||
|
|
a7f6ea12ce | ||
|
|
0a1cba43c4 | ||
|
|
666926e4b0 | ||
|
|
105e2488f6 | ||
|
|
cf62937632 | ||
|
|
c0985707ad | ||
|
|
2cc20a6bd5 | ||
|
|
28db75b3a3 | ||
|
|
6f31bffef6 | ||
|
|
9e6b251e68 | ||
|
|
da46d26f1b | ||
|
|
b4662607bf | ||
|
|
faac5e10e6 | ||
|
|
0fa5231e5a | ||
|
|
9f36cf7120 | ||
|
|
a654a47136 | ||
|
|
29dfac320c | ||
|
|
af11862dc1 | ||
|
|
1b5745c22f | ||
|
|
e6970fc00d | ||
|
|
cc4b7dc33e | ||
|
|
5417ba53be |
4
.github/workflows/aws_tfhe_fast_tests.yml
vendored
4
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -45,14 +45,14 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -29,10 +29,10 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
68
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
68
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
@@ -36,8 +36,8 @@ jobs:
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests
|
||||
cuda-pcc:
|
||||
name: CUDA post-commit checks
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
@@ -56,14 +56,14 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -94,6 +94,64 @@ jobs:
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests
|
||||
needs: [ setup-ec2, cuda-pcc ]
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
run: |
|
||||
make test_gpu
|
||||
@@ -121,7 +179,7 @@ jobs:
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (cuda-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, cuda-tests-linux ]
|
||||
needs: [ setup-ec2, cuda-pcc, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
|
||||
4
.github/workflows/aws_tfhe_integer_tests.yml
vendored
4
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -46,14 +46,14 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -46,14 +46,14 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
4
.github/workflows/aws_tfhe_tests.yml
vendored
4
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -46,14 +46,14 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
4
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
4
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -46,14 +46,14 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
8
.github/workflows/boolean_benchmark.yml
vendored
8
.github/workflows/boolean_benchmark.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -97,13 +97,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
2
.github/workflows/cargo_build.yml
vendored
2
.github/workflows/cargo_build.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Install and run newline linter checks
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
|
||||
2
.github/workflows/ci_lint.yml
vendored
2
.github/workflows/ci_lint.yml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Get actionlint
|
||||
run: |
|
||||
|
||||
117
.github/workflows/code_coverage.yml
vendored
117
.github/workflows/code_coverage.yml
vendored
@@ -6,70 +6,58 @@ env:
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (code-coverage)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-small
|
||||
|
||||
code-coverage:
|
||||
name: Code coverage
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
timeout-minutes: 11520 # 8 days
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
|
||||
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
@@ -78,29 +66,29 @@ jobs:
|
||||
- concrete-csprng/src/**
|
||||
|
||||
- name: Generate Keys
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make GEN_KEY_CACHE_COVERAGE_ONLY=TRUE gen_key_cache
|
||||
make gen_key_cache_core_crypto
|
||||
|
||||
- name: Run coverage for core_crypto
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_core_crypto_cov AVX512_SUPPORT=ON
|
||||
|
||||
- name: Run coverage for boolean
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_boolean_cov
|
||||
|
||||
- name: Run coverage for shortint
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
uses: codecov/codecov-action@84508663e988701840491b86de86b666e8a86bed
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
directory: ./coverage/
|
||||
@@ -108,13 +96,13 @@ jobs:
|
||||
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
|
||||
|
||||
- name: Run integer coverage
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
uses: codecov/codecov-action@84508663e988701840491b86de86b666e8a86bed
|
||||
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
directory: ./coverage/
|
||||
@@ -127,8 +115,29 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (code-coverage)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, code-coverage ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
8
.github/workflows/core_crypto_benchmark.yml
vendored
8
.github/workflows/core_crypto_benchmark.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -88,13 +88,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -71,7 +71,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -118,13 +118,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -47,14 +47,14 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
18
.github/workflows/gpu_4090_full_benchmark.yml
vendored
18
.github/workflows/gpu_4090_full_benchmark.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -52,12 +52,12 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -133,12 +133,12 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -163,7 +163,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -194,7 +194,7 @@ jobs:
|
||||
name: Remove 4090 bench label
|
||||
if: ${{ always() && github.event_name == 'pull_request' }}
|
||||
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
|
||||
10
.github/workflows/integer_benchmark.yml
vendored
10
.github/workflows/integer_benchmark.yml
vendored
@@ -46,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
8
.github/workflows/integer_full_benchmark.yml
vendored
8
.github/workflows/integer_full_benchmark.yml
vendored
@@ -74,7 +74,7 @@ jobs:
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -92,12 +92,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
10
.github/workflows/integer_gpu_benchmark.yml
vendored
10
.github/workflows/integer_gpu_benchmark.yml
vendored
@@ -56,7 +56,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -122,13 +122,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -132,7 +132,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -46,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -101,7 +101,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -123,13 +123,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
11
.github/workflows/m1_tests.yml
vendored
11
.github/workflows/m1_tests.yml
vendored
@@ -31,10 +31,10 @@ jobs:
|
||||
timeout-minutes: 720
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -86,6 +86,13 @@ jobs:
|
||||
run: |
|
||||
make test_boolean
|
||||
|
||||
# Because we do "illegal" things with the build system which Cargo does not seem to like much
|
||||
# we need to clear the cache to make sure the C API is built properly and does not use a stale
|
||||
# cached version
|
||||
- name: Clear build cache
|
||||
run: |
|
||||
cargo clean
|
||||
|
||||
- name: Run C API tests
|
||||
run: |
|
||||
make test_c_api
|
||||
|
||||
2
.github/workflows/make_release.yml
vendored
2
.github/workflows/make_release.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
4
.github/workflows/make_release_cuda.yml
vendored
4
.github/workflows/make_release_cuda.yml
vendored
@@ -55,7 +55,7 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
4
.github/workflows/parameters_check.yml
vendored
4
.github/workflows/parameters_check.yml
vendored
@@ -17,10 +17,10 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
|
||||
- name: Checkout lattice-estimator
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
|
||||
8
.github/workflows/shortint_benchmark.yml
vendored
8
.github/workflows/shortint_benchmark.yml
vendored
@@ -45,7 +45,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -89,13 +89,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -71,12 +71,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -115,7 +115,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
10
.github/workflows/signed_integer_benchmark.yml
vendored
10
.github/workflows/signed_integer_benchmark.yml
vendored
@@ -46,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -52,7 +52,7 @@ jobs:
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -70,12 +70,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -99,7 +99,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -46,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6
.github/workflows/start_benchmarks.yml
vendored
6
.github/workflows/start_benchmarks.yml
vendored
@@ -58,13 +58,13 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
|
||||
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
|
||||
with:
|
||||
files_yaml: |
|
||||
common_benches:
|
||||
@@ -111,7 +111,7 @@ jobs:
|
||||
- .github/workflows/wasm_client_benchmark.yml
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
4
.github/workflows/start_full_benchmarks.yml
vendored
4
.github/workflows/start_full_benchmarks.yml
vendored
@@ -30,12 +30,12 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
4
.github/workflows/sync_on_push.yml
vendored
4
.github/workflows/sync_on_push.yml
vendored
@@ -13,11 +13,11 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Save repo
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: repo-archive
|
||||
path: '.'
|
||||
|
||||
8
.github/workflows/wasm_client_benchmark.yml
vendored
8
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -98,13 +98,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
37
Makefile
37
Makefile
@@ -271,7 +271,7 @@ clippy_js_wasm_api: install_rs_check_toolchain
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
|
||||
clippy_tasks:
|
||||
clippy_tasks: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
-p tasks -- --no-deps -D warnings
|
||||
|
||||
@@ -281,19 +281,19 @@ clippy_trivium: install_rs_check_toolchain
|
||||
-p tfhe-trivium -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
clippy_all_targets:
|
||||
clippy_all_targets: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
|
||||
clippy_concrete_csprng:
|
||||
clippy_concrete_csprng: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE) \
|
||||
-p concrete-csprng -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
|
||||
clippy_zk_pok:
|
||||
clippy_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-zk-pok -- --no-deps -D warnings
|
||||
|
||||
@@ -376,7 +376,7 @@ build_c_api_gpu: install_rs_check_toolchain
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
@@ -424,13 +424,6 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- core_crypto::
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
|
||||
test_cuda_backend:
|
||||
@@ -444,14 +437,14 @@ test_cuda_backend:
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
test_core_crypto_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
|
||||
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
|
||||
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
test_integer_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
@@ -479,14 +472,14 @@ test_c_api_rs: install_rs_check_toolchain
|
||||
|
||||
.PHONY: test_c_api_c # Run the C tests for the C API
|
||||
test_c_api_c: build_c_api
|
||||
./scripts/c_api_tests.sh
|
||||
./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"
|
||||
|
||||
.PHONY: test_c_api # Run all the tests for the C API
|
||||
test_c_api: test_c_api_rs test_c_api_c
|
||||
|
||||
.PHONY: test_c_api_gpu # Run the C tests for the C API
|
||||
test_c_api_gpu: build_c_api_gpu
|
||||
./scripts/c_api_tests.sh --gpu
|
||||
./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
|
||||
|
||||
.PHONY: test_shortint_ci # Run the tests for shortint ci
|
||||
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
@@ -638,12 +631,12 @@ test_kreyvium: install_rs_build_toolchain
|
||||
-p tfhe-trivium -- --test-threads=1 kreyvium::
|
||||
|
||||
.PHONY: test_concrete_csprng # Run concrete-csprng tests
|
||||
test_concrete_csprng:
|
||||
test_concrete_csprng: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
|
||||
|
||||
.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
|
||||
test_zk_pok:
|
||||
test_zk_pok: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok
|
||||
|
||||
@@ -683,14 +676,14 @@ check_md_docs_are_tested:
|
||||
RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
|
||||
|
||||
.PHONY: check_compile_tests # Build tests in debug without running them
|
||||
check_compile_tests:
|
||||
check_compile_tests: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
|
||||
"$(MAKE)" build_c_api && \
|
||||
./scripts/c_api_tests.sh --build-only; \
|
||||
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
|
||||
fi
|
||||
|
||||
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
|
||||
@@ -813,8 +806,6 @@ bench_oprf: install_rs_check_toolchain
|
||||
--bench oprf-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
|
||||
|
||||
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
|
||||
bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
@@ -823,7 +814,6 @@ bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
|
||||
.PHONY: bench_boolean # Run benchmarks for boolean
|
||||
bench_boolean: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
@@ -867,6 +857,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
#
|
||||
# Utility tools
|
||||
#
|
||||
|
||||
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
|
||||
gen_key_cache: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
|
||||
@@ -71,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -143,7 +143,7 @@ use tfhe::prelude::*;
|
||||
use tfhe_trivium::TriviumStreamShortint;
|
||||
|
||||
fn test_shortint() {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_integers().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
|
||||
let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));
|
||||
|
||||
@@ -13,6 +13,7 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
|
||||
|
||||
[build-dependencies]
|
||||
cmake = { version = "0.1" }
|
||||
pkg-config = { version = "0.3" }
|
||||
|
||||
[dependencies]
|
||||
thiserror = "1.0"
|
||||
|
||||
@@ -21,7 +21,15 @@ fn main() {
|
||||
let dest = cmake::build("cuda");
|
||||
println!("cargo:rustc-link-search=native={}", dest.display());
|
||||
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
|
||||
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
|
||||
|
||||
// Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
|
||||
if pkg_config::Config::new()
|
||||
.atleast_version("10")
|
||||
.probe("cuda")
|
||||
.is_err()
|
||||
{
|
||||
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
|
||||
}
|
||||
println!("cargo:rustc-link-lib=gomp");
|
||||
println!("cargo:rustc-link-lib=cudart");
|
||||
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define synchronize_threads_in_block() __syncthreads()
|
||||
|
||||
extern "C" {
|
||||
|
||||
#define check_cuda_error(ans) \
|
||||
@@ -57,6 +56,8 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
bool cuda_check_support_cooperative_groups();
|
||||
|
||||
bool cuda_check_support_thread_block_clusters();
|
||||
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
|
||||
@@ -39,6 +39,22 @@ enum COMPARISON_TYPE {
|
||||
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
|
||||
|
||||
extern "C" {
|
||||
void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, void *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
|
||||
void *output_radix_lwe,
|
||||
void *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *ksk, void *bsk, uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -277,7 +293,23 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void);
|
||||
}
|
||||
|
||||
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
|
||||
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk,
|
||||
uint32_t num_blocks_in_radix);
|
||||
|
||||
void cleanup_cuda_integer_div_rem(cuda_stream_t *stream, int8_t **mem_ptr_void);
|
||||
|
||||
} // extern C
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
|
||||
@@ -304,6 +336,11 @@ void generate_device_accumulator_bivariate(
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f);
|
||||
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_bivariate_with_factor(
|
||||
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f, int factor);
|
||||
/*
|
||||
* generate univariate accumulator (lut) for device pointer
|
||||
* v_stream - cuda stream
|
||||
@@ -966,7 +1003,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
Torus *old_blocks;
|
||||
Torus *small_lwe_vector;
|
||||
int_radix_params params;
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
int_sc_prop_memory<Torus> *scp_mem;
|
||||
|
||||
int32_t *d_smart_copy_in;
|
||||
@@ -979,10 +1015,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
uint32_t max_num_radix_in_vec,
|
||||
bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// create single carry propagation memory object
|
||||
scp_mem = new int_sc_prop_memory<Torus>(stream, params, num_blocks_in_radix,
|
||||
@@ -1002,43 +1034,15 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
|
||||
d_smart_copy_out =
|
||||
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
|
||||
|
||||
// create lut object for message and carry
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
stream, params, 2, max_pbs_count, allocate_gpu_memory);
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(0);
|
||||
auto carry_acc = luts_message_carry->get_lut(1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, lut_f_message);
|
||||
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, lut_f_carry);
|
||||
}
|
||||
|
||||
int_sum_ciphertexts_vec_memory(cuda_stream_t *stream, int_radix_params params,
|
||||
uint32_t num_blocks_in_radix,
|
||||
uint32_t max_num_radix_in_vec,
|
||||
Torus *new_blocks, Torus *old_blocks,
|
||||
Torus *small_lwe_vector,
|
||||
int_radix_lut<Torus> *base_lut_object) {
|
||||
Torus *small_lwe_vector) {
|
||||
mem_reuse = true;
|
||||
this->params = params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// create single carry propagation memory object
|
||||
scp_mem = new int_sc_prop_memory<Torus>(stream, params, num_blocks_in_radix,
|
||||
@@ -1054,29 +1058,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
|
||||
d_smart_copy_out =
|
||||
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
|
||||
|
||||
// create lut object for message and carry
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
stream, params, 2, max_pbs_count, base_lut_object);
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(0);
|
||||
auto carry_acc = luts_message_carry->get_lut(1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, lut_f_message);
|
||||
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, lut_f_carry);
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
@@ -1090,10 +1071,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
}
|
||||
|
||||
scp_mem->release(stream);
|
||||
luts_message_carry->release(stream);
|
||||
|
||||
delete scp_mem;
|
||||
delete luts_message_carry;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1220,7 +1199,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
// create memory object for sum ciphertexts
|
||||
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
stream, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res,
|
||||
vector_result_sb, small_lwe_vector, luts_array);
|
||||
vector_result_sb, small_lwe_vector);
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
@@ -2156,6 +2135,327 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_div_rem_memory {
|
||||
int_radix_params params;
|
||||
bool mem_reuse = false;
|
||||
|
||||
// memory objects for other operations
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
|
||||
int_overflowing_sub_memory<Torus> *overflow_sub_mem;
|
||||
int_comparison_buffer<Torus> *comparison_buffer;
|
||||
|
||||
// lookup tables
|
||||
int_radix_lut<Torus> **masking_luts_1;
|
||||
int_radix_lut<Torus> **masking_luts_2;
|
||||
int_radix_lut<Torus> *message_extract_lut_1;
|
||||
int_radix_lut<Torus> *message_extract_lut_2;
|
||||
int_radix_lut<Torus> **zero_out_if_overflow_did_not_happen;
|
||||
int_radix_lut<Torus> **zero_out_if_overflow_happened;
|
||||
int_radix_lut<Torus> **merge_overflow_flags_luts;
|
||||
|
||||
// sub streams
|
||||
cuda_stream_t *sub_stream_1;
|
||||
cuda_stream_t *sub_stream_2;
|
||||
cuda_stream_t *sub_stream_3;
|
||||
cuda_stream_t *sub_stream_4;
|
||||
|
||||
// temporary device buffers
|
||||
Torus *remainder1;
|
||||
Torus *remainder2;
|
||||
Torus *numerator_block_stack;
|
||||
Torus *numerator_block_1;
|
||||
Torus *tmp_radix;
|
||||
Torus *interesting_remainder1;
|
||||
Torus *interesting_remainder2;
|
||||
Torus *interesting_divisor;
|
||||
Torus *divisor_ms_blocks;
|
||||
Torus *new_remainder;
|
||||
Torus *subtraction_overflowed;
|
||||
Torus *did_not_overflow;
|
||||
Torus *overflow_sum;
|
||||
Torus *overflow_sum_radix;
|
||||
Torus *tmp_1;
|
||||
Torus *at_least_one_upper_block_is_non_zero;
|
||||
Torus *cleaned_merged_interesting_remainder;
|
||||
|
||||
// allocate and initialize if needed, temporary arrays used to calculate
|
||||
// cuda integer div_rem operation
|
||||
void init_temporary_buffers(cuda_stream_t *stream, uint32_t num_blocks) {
|
||||
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
// non boolean temporary arrays, with `num_blocks` blocks
|
||||
remainder1 = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
remainder2 = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
numerator_block_stack = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
interesting_remainder2 = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
interesting_divisor = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
divisor_ms_blocks = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
new_remainder = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
cleaned_merged_interesting_remainder = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
tmp_1 = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
|
||||
// temporary arrays used as stacks
|
||||
tmp_radix = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream);
|
||||
interesting_remainder1 = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream);
|
||||
numerator_block_1 =
|
||||
(Torus *)cuda_malloc_async(big_lwe_size * 2 * sizeof(Torus), stream);
|
||||
|
||||
// temporary arrays for boolean blocks
|
||||
subtraction_overflowed =
|
||||
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
|
||||
did_not_overflow =
|
||||
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
|
||||
overflow_sum =
|
||||
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
|
||||
overflow_sum_radix = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), stream);
|
||||
at_least_one_upper_block_is_non_zero =
|
||||
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
|
||||
}
|
||||
|
||||
// initialize lookup tables for div_rem operation
|
||||
void init_lookup_tables(cuda_stream_t *stream, uint32_t num_blocks) {
|
||||
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
|
||||
|
||||
// create and generate masking_luts_1[] and masking_lut_2[]
|
||||
// both of them are equal but because they are used in two different
|
||||
// executions in parallel we need two different pbs_buffers.
|
||||
masking_luts_1 = new int_radix_lut<Torus> *[params.message_modulus - 1];
|
||||
masking_luts_2 = new int_radix_lut<Torus> *[params.message_modulus - 1];
|
||||
for (int i = 0; i < params.message_modulus - 1; i++) {
|
||||
uint32_t shifted_mask = i;
|
||||
std::function<Torus(Torus)> lut_f_masking =
|
||||
[shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
|
||||
|
||||
masking_luts_1[i] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
masking_luts_2[i] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
|
||||
Torus *luts[2] = {masking_luts_1[i]->lut, masking_luts_2[i]->lut};
|
||||
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, luts[j], params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus, lut_f_masking);
|
||||
}
|
||||
}
|
||||
|
||||
// create and generate message_extract_lut_1 and message_extract_lut_2
|
||||
// both of them are equal but because they are used in two different
|
||||
// executions in parallel we need two different pbs_buffers.
|
||||
message_extract_lut_1 =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
message_extract_lut_2 =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
|
||||
Torus *luts[2] = {message_extract_lut_1->lut, message_extract_lut_2->lut};
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, luts[j], params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus, lut_f_message_extract);
|
||||
}
|
||||
|
||||
// Give name to closures to improve readability
|
||||
auto overflow_happened = [](uint64_t overflow_sum) {
|
||||
return overflow_sum != 0;
|
||||
};
|
||||
auto overflow_did_not_happen = [&overflow_happened](uint64_t overflow_sum) {
|
||||
return !overflow_happened(overflow_sum);
|
||||
};
|
||||
|
||||
// create and generate zero_out_if_overflow_did_not_happen
|
||||
zero_out_if_overflow_did_not_happen = new int_radix_lut<Torus> *[2];
|
||||
zero_out_if_overflow_did_not_happen[0] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
zero_out_if_overflow_did_not_happen[1] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
|
||||
auto cur_lut_f = [&](Torus block, Torus overflow_sum) -> Torus {
|
||||
if (overflow_did_not_happen(overflow_sum)) {
|
||||
return 0;
|
||||
} else {
|
||||
return block;
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate_with_factor<Torus>(
|
||||
stream, zero_out_if_overflow_did_not_happen[0]->lut,
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, cur_lut_f, 2);
|
||||
generate_device_accumulator_bivariate_with_factor<Torus>(
|
||||
stream, zero_out_if_overflow_did_not_happen[1]->lut,
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, cur_lut_f, 3);
|
||||
|
||||
// create and generate zero_out_if_overflow_happened
|
||||
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
|
||||
zero_out_if_overflow_happened[0] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
zero_out_if_overflow_happened[1] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
|
||||
auto overflow_happened_f = [&](Torus block, Torus overflow_sum) -> Torus {
|
||||
if (overflow_happened(overflow_sum)) {
|
||||
return 0;
|
||||
} else {
|
||||
return block;
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate_with_factor<Torus>(
|
||||
stream, zero_out_if_overflow_happened[0]->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
overflow_happened_f, 2);
|
||||
generate_device_accumulator_bivariate_with_factor<Torus>(
|
||||
stream, zero_out_if_overflow_happened[1]->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
overflow_happened_f, 3);
|
||||
|
||||
// merge_overflow_flags_luts
|
||||
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
|
||||
for (int i = 0; i < num_bits_in_message; i++) {
|
||||
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
|
||||
return (x == 0 && y == 0) << i;
|
||||
};
|
||||
|
||||
merge_overflow_flags_luts[i] =
|
||||
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
stream, merge_overflow_flags_luts[i]->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_bit);
|
||||
}
|
||||
}
|
||||
|
||||
int_div_rem_memory(cuda_stream_t *stream, int_radix_params params,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
|
||||
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true);
|
||||
|
||||
shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
|
||||
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true);
|
||||
|
||||
overflow_sub_mem =
|
||||
new int_overflowing_sub_memory<Torus>(stream, params, num_blocks, true);
|
||||
|
||||
comparison_buffer = new int_comparison_buffer<Torus>(
|
||||
stream, COMPARISON_TYPE::NE, params, num_blocks, false, true);
|
||||
|
||||
init_lookup_tables(stream, num_blocks);
|
||||
init_temporary_buffers(stream, num_blocks);
|
||||
|
||||
sub_stream_1 = new cuda_stream_t(stream->gpu_index);
|
||||
sub_stream_2 = new cuda_stream_t(stream->gpu_index);
|
||||
sub_stream_3 = new cuda_stream_t(stream->gpu_index);
|
||||
sub_stream_4 = new cuda_stream_t(stream->gpu_index);
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
|
||||
|
||||
// release and delete other operation memory objects
|
||||
shift_mem_1->release(stream);
|
||||
shift_mem_2->release(stream);
|
||||
overflow_sub_mem->release(stream);
|
||||
comparison_buffer->release(stream);
|
||||
delete shift_mem_1;
|
||||
delete shift_mem_2;
|
||||
delete overflow_sub_mem;
|
||||
delete comparison_buffer;
|
||||
|
||||
// release and delete lookup tables
|
||||
|
||||
// masking_luts_1 and masking_luts_2
|
||||
for (int i = 0; i < params.message_modulus - 1; i++) {
|
||||
masking_luts_1[i]->release(stream);
|
||||
masking_luts_2[i]->release(stream);
|
||||
|
||||
delete masking_luts_1[i];
|
||||
delete masking_luts_2[i];
|
||||
}
|
||||
delete[] masking_luts_1;
|
||||
delete[] masking_luts_2;
|
||||
|
||||
// message_extract_lut_1 and message_extract_lut_2
|
||||
message_extract_lut_1->release(stream);
|
||||
message_extract_lut_2->release(stream);
|
||||
|
||||
delete message_extract_lut_1;
|
||||
delete message_extract_lut_2;
|
||||
|
||||
// zero_out_if_overflow_did_not_happen
|
||||
zero_out_if_overflow_did_not_happen[0]->release(stream);
|
||||
zero_out_if_overflow_did_not_happen[1]->release(stream);
|
||||
|
||||
delete zero_out_if_overflow_did_not_happen[0];
|
||||
delete zero_out_if_overflow_did_not_happen[1];
|
||||
|
||||
delete[] zero_out_if_overflow_did_not_happen;
|
||||
|
||||
// zero_out_if_overflow_happened
|
||||
zero_out_if_overflow_happened[0]->release(stream);
|
||||
zero_out_if_overflow_happened[1]->release(stream);
|
||||
|
||||
delete zero_out_if_overflow_happened[0];
|
||||
delete zero_out_if_overflow_happened[1];
|
||||
|
||||
delete[] zero_out_if_overflow_happened;
|
||||
|
||||
// merge_overflow_flags_luts
|
||||
for (int i = 0; i < num_bits_in_message; i++) {
|
||||
merge_overflow_flags_luts[i]->release(stream);
|
||||
|
||||
delete merge_overflow_flags_luts[i];
|
||||
}
|
||||
delete[] merge_overflow_flags_luts;
|
||||
|
||||
// release sub streams
|
||||
sub_stream_1->release();
|
||||
sub_stream_2->release();
|
||||
sub_stream_3->release();
|
||||
sub_stream_4->release();
|
||||
|
||||
// drop temporary buffers
|
||||
cuda_drop_async(remainder1, stream);
|
||||
cuda_drop_async(remainder2, stream);
|
||||
cuda_drop_async(numerator_block_stack, stream);
|
||||
cuda_drop_async(numerator_block_1, stream);
|
||||
cuda_drop_async(tmp_radix, stream);
|
||||
cuda_drop_async(interesting_remainder1, stream);
|
||||
cuda_drop_async(interesting_remainder2, stream);
|
||||
cuda_drop_async(interesting_divisor, stream);
|
||||
cuda_drop_async(divisor_ms_blocks, stream);
|
||||
cuda_drop_async(new_remainder, stream);
|
||||
cuda_drop_async(subtraction_overflowed, stream);
|
||||
cuda_drop_async(did_not_overflow, stream);
|
||||
cuda_drop_async(overflow_sum, stream);
|
||||
cuda_drop_async(overflow_sum_radix, stream);
|
||||
cuda_drop_async(tmp_1, stream);
|
||||
cuda_drop_async(at_least_one_upper_block_is_non_zero, stream);
|
||||
cuda_drop_async(cleaned_merged_interesting_remainder, stream);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_bitop_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include <cstdint>
|
||||
|
||||
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
|
||||
|
||||
extern "C" {
|
||||
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
|
||||
@@ -111,6 +111,28 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // tbc
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
@@ -125,6 +147,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
|
||||
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
@@ -213,6 +240,54 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream);
|
||||
} break;
|
||||
#if CUDA_ARCH >= 900
|
||||
case PBS_VARIANT::TBC: {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc = 0;
|
||||
if (supports_dsm)
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size);
|
||||
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
|
||||
// There is a minimum amount of memory we need to run the TBC PBS, which
|
||||
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
|
||||
// because otherwise the previous check would have redirected
|
||||
// computation to some other variant. If over that we don't have more
|
||||
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
|
||||
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
|
||||
//
|
||||
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
|
||||
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream);
|
||||
} break;
|
||||
#endif
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
@@ -281,6 +356,25 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
#endif
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_programmable_bootstrap_cg(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
@@ -295,6 +389,13 @@ void scratch_cuda_programmable_bootstrap(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
|
||||
@@ -51,6 +51,37 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
#endif
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
@@ -113,12 +144,25 @@ template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_keybundle = NULL;
|
||||
int8_t *d_mem_acc_step_one = NULL;
|
||||
int8_t *d_mem_acc_step_two = NULL;
|
||||
int8_t *d_mem_acc_cg = NULL;
|
||||
int8_t *d_mem_acc_tbc = NULL;
|
||||
|
||||
double2 *keybundle_fft;
|
||||
Torus *global_accumulator;
|
||||
@@ -133,18 +177,20 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
this->pbs_variant = pbs_variant;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
|
||||
|
||||
// default
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
|
||||
Torus>(polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_one =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_accumulate_step_one =
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
|
||||
Torus>(polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_accumulate_step_one =
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
|
||||
Torus>(polynomial_size);
|
||||
// cg
|
||||
uint64_t full_sm_cg_accumulate =
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
@@ -162,6 +208,19 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
auto num_blocks_acc_cg =
|
||||
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
uint64_t full_sm_tbc_accumulate =
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_tbc_accumulate =
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
auto num_blocks_acc_tbc = num_blocks_acc_cg;
|
||||
#endif
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
// Keybundle
|
||||
if (max_shared_memory < full_sm_keybundle)
|
||||
@@ -169,7 +228,16 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
num_blocks_keybundle * full_sm_keybundle, stream);
|
||||
|
||||
switch (pbs_variant) {
|
||||
case DEFAULT:
|
||||
case PBS_VARIANT::CG:
|
||||
// Accumulator CG
|
||||
if (max_shared_memory < partial_sm_cg_accumulate)
|
||||
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
|
||||
else if (max_shared_memory < full_sm_cg_accumulate)
|
||||
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
// Accumulator step one
|
||||
if (max_shared_memory < partial_sm_accumulate_step_one)
|
||||
d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
|
||||
@@ -183,15 +251,27 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
|
||||
break;
|
||||
case CG:
|
||||
// Accumulator CG
|
||||
if (max_shared_memory < partial_sm_cg_accumulate)
|
||||
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
|
||||
else if (max_shared_memory < full_sm_cg_accumulate)
|
||||
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
|
||||
#if CUDA_ARCH >= 900
|
||||
case TBC:
|
||||
// There is a minimum amount of memory we need to run the TBC PBS, which
|
||||
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
|
||||
// because otherwise the previous check would have redirected
|
||||
// computation to some other variant. If over that we don't have more
|
||||
// partial_sm_tbc_accumulate bytes, TBC PBS will run on NOSM. If we have
|
||||
// partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes, it
|
||||
// will run on PARTIALSM. Otherwise, FULLSM.
|
||||
//
|
||||
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
|
||||
|
||||
// Accumulator TBC
|
||||
if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
|
||||
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_tbc * full_sm_tbc_accumulate, stream);
|
||||
else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
|
||||
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
|
||||
num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
@@ -200,7 +280,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
|
||||
stream);
|
||||
global_accumulator = (Torus *)cuda_malloc_async(
|
||||
num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
|
||||
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream);
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
|
||||
stream);
|
||||
@@ -222,6 +302,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
if (d_mem_acc_cg)
|
||||
cuda_drop_async(d_mem_acc_cg, stream);
|
||||
break;
|
||||
#if CUDA_ARCH >= 900
|
||||
case TBC:
|
||||
if (d_mem_acc_tbc)
|
||||
cuda_drop_async(d_mem_acc_tbc, stream);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
@@ -232,10 +318,9 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);
|
||||
|
||||
#endif
|
||||
template <typename Torus, class params>
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
|
||||
@@ -71,6 +71,18 @@ bool cuda_check_support_cooperative_groups() {
|
||||
return cooperative_groups_supported > 0;
|
||||
}
|
||||
|
||||
/// Returns
|
||||
/// false if Thread Block Cluster is not supported.
|
||||
/// true otherwise
|
||||
bool cuda_check_support_thread_block_clusters() {
|
||||
// To-do: Is this really the best way to check support?
|
||||
int tbc_supported = 0;
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
|
||||
|
||||
return tbc_supported > 0;
|
||||
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
|
||||
83
backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
Normal file
83
backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
Normal file
@@ -0,0 +1,83 @@
|
||||
#include "integer/div_rem.cuh"
|
||||
|
||||
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_integer_div_rem_kb<uint64_t>(
|
||||
stream, (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
|
||||
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
|
||||
|
||||
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<512>>(
|
||||
stream, static_cast<uint64_t *>(quotient),
|
||||
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
|
||||
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
|
||||
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
|
||||
stream, static_cast<uint64_t *>(quotient),
|
||||
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
|
||||
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
|
||||
stream, static_cast<uint64_t *>(quotient),
|
||||
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
|
||||
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
|
||||
stream, static_cast<uint64_t *>(quotient),
|
||||
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
|
||||
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
|
||||
stream, static_cast<uint64_t *>(quotient),
|
||||
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
|
||||
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
|
||||
stream, static_cast<uint64_t *>(quotient),
|
||||
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
|
||||
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
|
||||
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_div_rem(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_div_rem_memory<uint64_t> *mem_ptr =
|
||||
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
587
backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Normal file
587
backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Normal file
@@ -0,0 +1,587 @@
|
||||
#ifndef TFHE_RS_DIV_REM_CUH
|
||||
#define TFHE_RS_DIV_REM_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/negation.cuh"
|
||||
#include "integer/scalar_shifts.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int ceil_div(int a, int b) { return (a + b - 1) / b; }
|
||||
|
||||
// struct makes it easier to use list of ciphertexts and move data between them
|
||||
// struct does not allocate or drop any memory,
|
||||
// keeps track on number of ciphertexts inside list.
|
||||
template <typename Torus> struct lwe_ciphertext_list {
|
||||
Torus *data;
|
||||
size_t max_blocks;
|
||||
size_t len;
|
||||
int_radix_params params;
|
||||
|
||||
size_t big_lwe_size;
|
||||
size_t radix_size;
|
||||
size_t big_lwe_size_bytes;
|
||||
size_t radix_size_bytes;
|
||||
size_t big_lwe_dimension;
|
||||
|
||||
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
|
||||
: data(src), params(params), max_blocks(max_blocks) {
|
||||
big_lwe_size = params.big_lwe_dimension + 1;
|
||||
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
radix_size = max_blocks * big_lwe_size;
|
||||
radix_size_bytes = radix_size * sizeof(Torus);
|
||||
big_lwe_dimension = params.big_lwe_dimension;
|
||||
len = max_blocks;
|
||||
}
|
||||
|
||||
// copies ciphertexts from Torus*, starting from `starting_block` including
|
||||
// `finish_block`, does not change the value of self len
|
||||
void copy_from(Torus *src, size_t start_block, size_t finish_block,
|
||||
cuda_stream_t *stream) {
|
||||
size_t tmp_len = finish_block - start_block + 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
|
||||
tmp_len * big_lwe_size_bytes, stream);
|
||||
}
|
||||
|
||||
// copies ciphertexts from lwe_ciphertext_list, starting from `starting_block`
|
||||
// including `finish_block`, does not change the value of self len
|
||||
void copy_from(const lwe_ciphertext_list &src, size_t start_block,
|
||||
size_t finish_block, cuda_stream_t *stream) {
|
||||
copy_from(src.data, start_block, finish_block, stream);
|
||||
}
|
||||
|
||||
// copies ciphertexts from Torus*, starting from `starting_block`
|
||||
// including `finish_block`, updating the value of self len
|
||||
void clone_from(Torus *src, size_t start_block, size_t finish_block,
|
||||
cuda_stream_t *stream) {
|
||||
len = finish_block - start_block + 1;
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
|
||||
len * big_lwe_size_bytes, stream);
|
||||
}
|
||||
|
||||
// copies ciphertexts from ciphertexts_list, starting from `starting_block`
|
||||
// including `finish_block`, updating the value of self len
|
||||
void clone_from(const lwe_ciphertext_list &src, size_t start_block,
|
||||
size_t finish_block, cuda_stream_t *stream) {
|
||||
clone_from(src.data, start_block, finish_block, stream);
|
||||
}
|
||||
|
||||
// assign zero to blocks starting from `start_block` including `finish_block`
|
||||
void assign_zero(size_t start_block, size_t finish_block,
|
||||
cuda_stream_t *stream) {
|
||||
auto size = finish_block - start_block + 1;
|
||||
cuda_memset_async(&data[start_block * big_lwe_size], 0,
|
||||
size * big_lwe_size_bytes, stream);
|
||||
}
|
||||
|
||||
// return pointer to last block
|
||||
Torus *last_block() { return &data[(len - 1) * big_lwe_size]; }
|
||||
|
||||
// return pointer to first_block
|
||||
Torus *first_block() { return data; }
|
||||
|
||||
// return block with `index`
|
||||
Torus *get_block(size_t index) {
|
||||
assert(index < len);
|
||||
return &data[index * big_lwe_size];
|
||||
}
|
||||
|
||||
bool is_empty() { return len == 0; }
|
||||
|
||||
// does not dop actual memory from `data`, only reduces value of `len` by one
|
||||
void pop() {
|
||||
if (len > 0)
|
||||
len--;
|
||||
else
|
||||
assert(len > 0);
|
||||
}
|
||||
|
||||
// insert ciphertext at index `ind`
|
||||
void insert(size_t ind, Torus *ciphertext_block, cuda_stream_t *stream) {
|
||||
assert(ind <= len);
|
||||
assert(len < max_blocks);
|
||||
|
||||
size_t insert_offset = ind * big_lwe_size;
|
||||
|
||||
for (size_t i = len; i > ind; i--) {
|
||||
Torus *src = &data[(i - 1) * big_lwe_size];
|
||||
Torus *dst = &data[i * big_lwe_size];
|
||||
cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream);
|
||||
}
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
|
||||
big_lwe_size_bytes, stream);
|
||||
len++;
|
||||
}
|
||||
|
||||
// push ciphertext at the end of `data`
|
||||
void push(Torus *ciphertext_block, cuda_stream_t *stream) {
|
||||
assert(len < max_blocks);
|
||||
|
||||
size_t offset = len * big_lwe_size;
|
||||
cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
|
||||
big_lwe_size_bytes, stream);
|
||||
len++;
|
||||
}
|
||||
|
||||
// duplicate ciphertext into `number_of_blocks` ciphertexts
|
||||
void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
|
||||
cuda_stream_t *stream) {
|
||||
assert(number_of_blocks <= max_blocks);
|
||||
|
||||
for (size_t i = 0; i < number_of_blocks; i++) {
|
||||
Torus *dest = &data[i * big_lwe_size];
|
||||
cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes,
|
||||
stream);
|
||||
}
|
||||
|
||||
len = number_of_blocks;
|
||||
}
|
||||
|
||||
// used for debugging, prints body of each ciphertext.
|
||||
void print_blocks_body(const char *name) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_div_rem_kb(
|
||||
cuda_stream_t *stream, int_div_rem_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_div_rem_memory<Torus>(stream, params, num_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_div_rem_kb(cuda_stream_t *stream, Torus *quotient,
|
||||
Torus *remainder, Torus *numerator,
|
||||
Torus *divisor, void *bsk, uint64_t *ksk,
|
||||
int_div_rem_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
auto radix_params = mem_ptr->params;
|
||||
|
||||
auto big_lwe_dimension = radix_params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
uint32_t message_modulus = radix_params.message_modulus;
|
||||
uint32_t carry_modulus = radix_params.carry_modulus;
|
||||
uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
|
||||
uint32_t total_bits = num_bits_in_message * num_blocks;
|
||||
|
||||
// put temporary buffers in lwe_ciphertext_list for easy use
|
||||
lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
|
||||
num_blocks);
|
||||
lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
|
||||
num_blocks);
|
||||
lwe_ciphertext_list<Torus> numerator_block_stack(
|
||||
mem_ptr->numerator_block_stack, radix_params, num_blocks);
|
||||
lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
|
||||
radix_params, 1);
|
||||
lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
|
||||
num_blocks + 1);
|
||||
lwe_ciphertext_list<Torus> interesting_remainder1(
|
||||
mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
|
||||
lwe_ciphertext_list<Torus> interesting_remainder2(
|
||||
mem_ptr->interesting_remainder2, radix_params, num_blocks);
|
||||
lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
|
||||
radix_params, num_blocks);
|
||||
lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
|
||||
radix_params, num_blocks);
|
||||
lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
|
||||
num_blocks);
|
||||
lwe_ciphertext_list<Torus> subtraction_overflowed(
|
||||
mem_ptr->subtraction_overflowed, radix_params, 1);
|
||||
lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
|
||||
radix_params, 1);
|
||||
lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
|
||||
1);
|
||||
lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
|
||||
radix_params, num_blocks);
|
||||
lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
|
||||
lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
|
||||
mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
|
||||
lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
|
||||
mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
|
||||
|
||||
numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, stream);
|
||||
remainder1.assign_zero(0, num_blocks - 1, stream);
|
||||
remainder2.assign_zero(0, num_blocks - 1, stream);
|
||||
|
||||
cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, stream);
|
||||
|
||||
for (int i = total_bits - 1; i >= 0; i--) {
|
||||
uint32_t block_of_bit = i / num_bits_in_message;
|
||||
uint32_t pos_in_block = i % num_bits_in_message;
|
||||
uint32_t msb_bit_set = total_bits - 1 - i;
|
||||
uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;
|
||||
|
||||
// Index to the first block of the remainder that is fully trivial 0
|
||||
// and all blocks after it are also trivial zeros
|
||||
// This number is in range 1..=num_bocks -1
|
||||
uint32_t first_trivial_block = last_non_trivial_block + 1;
|
||||
|
||||
interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
|
||||
stream);
|
||||
interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
|
||||
stream);
|
||||
interesting_divisor.clone_from(divisor, 0, last_non_trivial_block, stream);
|
||||
divisor_ms_blocks.clone_from(divisor,
|
||||
(msb_bit_set + 1) / num_bits_in_message,
|
||||
num_blocks - 1, stream);
|
||||
|
||||
// We split the divisor at a block position, when in reality the split
|
||||
// should be at a bit position meaning that potentially (depending on
|
||||
// msb_bit_set) the split versions share some bits they should not. So we do
|
||||
// one PBS on the last block of the interesting_divisor, and first block of
|
||||
// divisor_ms_blocks to trim out bits which should not be there
|
||||
auto trim_last_interesting_divisor_bits = [&](cuda_stream_t *stream) {
|
||||
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
|
||||
return;
|
||||
}
|
||||
// The last block of the interesting part of the remainder
|
||||
// can contain bits which we should not account for
|
||||
// we have to zero them out.
|
||||
|
||||
// Where the msb is set in the block
|
||||
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
|
||||
|
||||
// e.g 2 bits in message:
|
||||
// if pos_in_block is 0, then we want to keep only first bit (right
|
||||
// shift
|
||||
// mask by 1) if pos_in_block is 1, then we want to keep the two
|
||||
// bits
|
||||
// (right shift mask by 0)
|
||||
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
|
||||
|
||||
// Create mask of 1s on the message part, 0s in the carries
|
||||
uint32_t full_message_mask = message_modulus - 1;
|
||||
|
||||
// Shift the mask so that we will only keep bits we should
|
||||
uint32_t shifted_mask = full_message_mask >> shift_amount;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
stream, interesting_divisor.last_block(),
|
||||
interesting_divisor.last_block(), bsk, ksk, 1,
|
||||
mem_ptr->masking_luts_1[shifted_mask]);
|
||||
}; // trim_last_interesting_divisor_bits
|
||||
|
||||
auto trim_first_divisor_ms_bits = [&](cuda_stream_t *stream) {
|
||||
if (divisor_ms_blocks.is_empty() ||
|
||||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
|
||||
return;
|
||||
}
|
||||
// Where the msb is set in the block
|
||||
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
|
||||
|
||||
// e.g 2 bits in message:
|
||||
// if pos_in_block is 0, then we want to discard the first bit (left shift
|
||||
// mask by 1) if pos_in_block is 1, then we want to discard the two bits
|
||||
// (left shift mask by 2) let shift_amount = num_bits_in_message -
|
||||
// pos_in_block
|
||||
uint32_t shift_amount = pos_in_block + 1;
|
||||
uint32_t full_message_mask = message_modulus - 1;
|
||||
uint32_t shifted_mask = full_message_mask << shift_amount;
|
||||
|
||||
// Keep the mask within the range of message bits, so that
|
||||
// the estimated degree of the output is < msg_modulus
|
||||
shifted_mask = shifted_mask & full_message_mask;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
stream, divisor_ms_blocks.first_block(),
|
||||
divisor_ms_blocks.first_block(), bsk, ksk, 1,
|
||||
mem_ptr->masking_luts_2[shifted_mask]);
|
||||
}; // trim_first_divisor_ms_bits
|
||||
|
||||
// This does
|
||||
// R := R << 1; R(0) := N(i)
|
||||
//
|
||||
// We could to that by left shifting, R by one, then unchecked_add the
|
||||
// correct numerator bit.
|
||||
//
|
||||
// However, to keep the remainder clean (noise wise), what we do is that we
|
||||
// put the remainder block from which we need to extract the bit, as the LSB
|
||||
// of the Remainder, so that left shifting will pull the bit we need.
|
||||
auto left_shift_interesting_remainder1 = [&](cuda_stream_t *stream3) {
|
||||
numerator_block_1.clone_from(numerator_block_stack,
|
||||
numerator_block_stack.len - 1,
|
||||
numerator_block_stack.len - 1, stream3);
|
||||
numerator_block_stack.pop();
|
||||
interesting_remainder1.insert(0, numerator_block_1.first_block(),
|
||||
stream3);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
stream3, interesting_remainder1.data, 1, mem_ptr->shift_mem_1, bsk,
|
||||
ksk, interesting_remainder1.len);
|
||||
|
||||
tmp_radix.clone_from(interesting_remainder1, 0,
|
||||
interesting_remainder1.len - 1, stream3);
|
||||
|
||||
radix_blocks_rotate_left<<<interesting_remainder1.len, 256, 0,
|
||||
stream3->stream>>>(
|
||||
interesting_remainder1.data, tmp_radix.data, 1,
|
||||
interesting_remainder1.len, big_lwe_size);
|
||||
|
||||
numerator_block_1.clone_from(interesting_remainder1,
|
||||
interesting_remainder1.len - 1,
|
||||
interesting_remainder1.len - 1, stream3);
|
||||
|
||||
interesting_remainder1.pop();
|
||||
|
||||
if (pos_in_block != 0) {
|
||||
// We have not yet extracted all the bits from this numerator
|
||||
// so, we put it back on the front so that it gets taken next iteration
|
||||
numerator_block_stack.push(numerator_block_1.first_block(), stream3);
|
||||
}
|
||||
}; // left_shift_interesting_remainder1
|
||||
|
||||
auto left_shift_interesting_remainder2 = [&](cuda_stream_t *stream4) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
stream4, interesting_remainder2.data, 1, mem_ptr->shift_mem_2, bsk,
|
||||
ksk, interesting_remainder2.len);
|
||||
}; // left_shift_interesting_remainder2
|
||||
|
||||
stream->synchronize();
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_stream_1);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// divisor_ms_blocks
|
||||
trim_first_divisor_ms_bits(mem_ptr->sub_stream_2);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_remainder1
|
||||
// numerator_block_stack
|
||||
left_shift_interesting_remainder1(mem_ptr->sub_stream_3);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_stream_4);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_1);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_2);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_3);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_4);
|
||||
|
||||
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
|
||||
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
|
||||
// In practice interesting_remainder1 contains the numerator bit,
|
||||
// but in that position, interesting_remainder2 always has a 0
|
||||
auto &merged_interesting_remainder = interesting_remainder1;
|
||||
|
||||
host_addition(stream, merged_interesting_remainder.data,
|
||||
merged_interesting_remainder.data,
|
||||
interesting_remainder2.data, radix_params.big_lwe_dimension,
|
||||
merged_interesting_remainder.len);
|
||||
|
||||
// after create_clean_version_of_merged_remainder
|
||||
// `merged_interesting_remainder` will be reused as
|
||||
// `cleaned_merged_interesting_remainder`
|
||||
cleaned_merged_interesting_remainder.clone_from(
|
||||
merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
|
||||
stream);
|
||||
|
||||
assert(merged_interesting_remainder.len == interesting_divisor.len);
|
||||
|
||||
// `new_remainder` is not initialized yet, so need to set length
|
||||
new_remainder.len = merged_interesting_remainder.len;
|
||||
|
||||
// fills:
|
||||
// `new_remainder` - radix ciphertext
|
||||
// `subtraction_overflowed` - single ciphertext
|
||||
auto do_overflowing_sub = [&](cuda_stream_t *stream) {
|
||||
host_integer_overflowing_sub_kb<Torus, params>(
|
||||
stream, new_remainder.data, subtraction_overflowed.data,
|
||||
merged_interesting_remainder.data, interesting_divisor.data, bsk, ksk,
|
||||
mem_ptr->overflow_sub_mem, merged_interesting_remainder.len);
|
||||
};
|
||||
|
||||
// fills:
|
||||
// `at_least_one_upper_block_is_non_zero` - single ciphertext
|
||||
auto check_divisor_upper_blocks = [&](cuda_stream_t *stream) {
|
||||
auto &trivial_blocks = divisor_ms_blocks;
|
||||
if (trivial_blocks.is_empty()) {
|
||||
cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
|
||||
big_lwe_size_bytes, stream);
|
||||
} else {
|
||||
|
||||
// We could call unchecked_scalar_ne
|
||||
// But we are in the special case where scalar == 0
|
||||
// So we can skip some stuff
|
||||
host_compare_with_zero_equality(
|
||||
stream, tmp_1.data, trivial_blocks.data, mem_ptr->comparison_buffer,
|
||||
bsk, ksk, trivial_blocks.len,
|
||||
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
|
||||
|
||||
tmp_1.len =
|
||||
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
|
||||
|
||||
is_at_least_one_comparisons_block_true(
|
||||
stream, at_least_one_upper_block_is_non_zero.data, tmp_1.data,
|
||||
mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
|
||||
}
|
||||
};
|
||||
|
||||
// Creates a cleaned version (noise wise) of the merged remainder
|
||||
// so that it can be safely used in bivariate PBSes
|
||||
// fills:
|
||||
// `cleaned_merged_interesting_remainder` - radix ciphertext
|
||||
auto create_clean_version_of_merged_remainder = [&](cuda_stream_t *stream) {
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
stream, cleaned_merged_interesting_remainder.data,
|
||||
cleaned_merged_interesting_remainder.data, bsk, ksk,
|
||||
cleaned_merged_interesting_remainder.len,
|
||||
mem_ptr->message_extract_lut_1);
|
||||
};
|
||||
|
||||
// phase 2
|
||||
stream->synchronize();
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
do_overflowing_sub(mem_ptr->sub_stream_1);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// at_least_one_upper_block_is_non_zero
|
||||
check_divisor_upper_blocks(mem_ptr->sub_stream_2);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// cleaned_merged_interesting_remainder
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_stream_3);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_1);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_2);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_3);
|
||||
|
||||
host_addition(stream, overflow_sum.data, subtraction_overflowed.data,
|
||||
at_least_one_upper_block_is_non_zero.data,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
|
||||
int factor = (i) ? 3 : 2;
|
||||
int factor_lut_id = factor - 2;
|
||||
overflow_sum_radix.fill_with_same_ciphertext(
|
||||
overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
|
||||
stream);
|
||||
|
||||
auto conditionally_zero_out_merged_interesting_remainder =
|
||||
[&](cuda_stream_t *stream) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
|
||||
stream, cleaned_merged_interesting_remainder.data,
|
||||
cleaned_merged_interesting_remainder.data,
|
||||
overflow_sum_radix.data, bsk, ksk,
|
||||
cleaned_merged_interesting_remainder.len,
|
||||
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
|
||||
factor);
|
||||
};
|
||||
|
||||
auto conditionally_zero_out_merged_new_remainder =
|
||||
[&](cuda_stream_t *stream) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
|
||||
stream, new_remainder.data, new_remainder.data,
|
||||
overflow_sum_radix.data, bsk, ksk, new_remainder.len,
|
||||
mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
|
||||
};
|
||||
|
||||
auto set_quotient_bit = [&](cuda_stream_t *stream) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, did_not_overflow.data, subtraction_overflowed.data,
|
||||
at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
|
||||
mem_ptr->merge_overflow_flags_luts[pos_in_block]);
|
||||
|
||||
host_addition(stream, "ient[block_of_bit * big_lwe_size],
|
||||
"ient[block_of_bit * big_lwe_size],
|
||||
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
|
||||
};
|
||||
|
||||
stream->synchronize();
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// cleaned_merged_interesting_remainder
|
||||
conditionally_zero_out_merged_interesting_remainder(
|
||||
mem_ptr->sub_stream_1);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// new_remainder
|
||||
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_stream_2);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_stream_3);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_1);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_2);
|
||||
cuda_synchronize_stream(mem_ptr->sub_stream_3);
|
||||
|
||||
assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
|
||||
assert(first_trivial_block - 1 == new_remainder.len);
|
||||
|
||||
remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
|
||||
first_trivial_block - 1, stream);
|
||||
remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, stream);
|
||||
}
|
||||
|
||||
assert(remainder1.len == remainder2.len);
|
||||
|
||||
// Clean the quotient and remainder
|
||||
// as even though they have no carries, they are not at nominal noise level
|
||||
host_addition(stream, remainder, remainder1.data, remainder2.data,
|
||||
radix_params.big_lwe_dimension, remainder1.len);
|
||||
|
||||
stream->synchronize();
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_stream_1, remainder, remainder, bsk, ksk, num_blocks,
|
||||
mem_ptr->message_extract_lut_1);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_stream_2, quotient, quotient, bsk, ksk, num_blocks,
|
||||
mem_ptr->message_extract_lut_2);
|
||||
}
|
||||
}
|
||||
mem_ptr->sub_stream_1->synchronize();
|
||||
mem_ptr->sub_stream_2->synchronize();
|
||||
}
|
||||
|
||||
#endif // TFHE_RS_DIV_REM_CUH
|
||||
@@ -141,3 +141,41 @@ void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
|
||||
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
|
||||
void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, void *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_apply_univariate_lut_kb<uint64_t>(
|
||||
stream, (int_radix_lut<uint64_t> **)mem_ptr,
|
||||
static_cast<uint64_t *>(input_lut), num_radix_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
|
||||
void *output_radix_lwe,
|
||||
void *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *ksk, void *bsk,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
host_apply_univariate_lut_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(output_radix_lwe),
|
||||
static_cast<uint64_t *>(input_radix_lwe),
|
||||
(int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk,
|
||||
num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
|
||||
@@ -172,6 +172,47 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_bivariate_lookup_table_kb_factor(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
|
||||
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
|
||||
int_radix_lut<Torus> *lut, uint32_t shift) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
// apply_lookup_table_bivariate
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = params.small_lwe_dimension;
|
||||
auto ks_level = params.ks_level;
|
||||
auto ks_base_log = params.ks_base_log;
|
||||
auto pbs_level = params.pbs_level;
|
||||
auto pbs_base_log = params.pbs_base_log;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
// Left message is shifted
|
||||
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
|
||||
pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
|
||||
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
|
||||
big_lwe_dimension, shift, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Apply LUT
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
|
||||
lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
|
||||
lut->lut_indexes, lut->tmp_lwe_after_ks,
|
||||
lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
|
||||
}
|
||||
|
||||
// Rotates the slice in-place such that the first mid elements of the slice move
|
||||
// to the end while the last array_length elements move to the front. After
|
||||
// calling rotate_left, the element previously at index mid will become the
|
||||
@@ -235,6 +276,24 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
|
||||
message_modulus, carry_modulus, wrapped_f);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void generate_lookup_table_bivariate_with_factor(
|
||||
Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f, int factor) {
|
||||
|
||||
Torus factor_u64 = factor;
|
||||
auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
|
||||
Torus lhs = (input / factor_u64) % message_modulus;
|
||||
Torus rhs = (input % factor_u64) % message_modulus;
|
||||
|
||||
return f(lhs, rhs);
|
||||
};
|
||||
|
||||
generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, wrapped_f);
|
||||
}
|
||||
|
||||
/*
|
||||
* generate bivariate accumulator for device pointer
|
||||
* v_stream - cuda stream
|
||||
@@ -266,7 +325,38 @@ void generate_device_accumulator_bivariate(
|
||||
}
|
||||
|
||||
/*
|
||||
* generate bivariate accumulator for device pointer
|
||||
* generate bivariate accumulator with factor scaling for device pointer
|
||||
* v_stream - cuda stream
|
||||
* acc - device pointer for bivariate accumulator
|
||||
* ...
|
||||
* f - wrapping function with two Torus inputs
|
||||
*/
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_bivariate_with_factor(
|
||||
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f, int factor) {
|
||||
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
// fill bivariate accumulator
|
||||
generate_lookup_table_bivariate_with_factor<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
|
||||
factor);
|
||||
|
||||
// copy host lut and lut_indexes to device
|
||||
cuda_memcpy_async_to_gpu(
|
||||
acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
|
||||
|
||||
// Release memory when possible
|
||||
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
|
||||
}
|
||||
|
||||
/*
|
||||
* generate accumulator for device pointer
|
||||
* v_stream - cuda stream
|
||||
* acc - device pointer for accumulator
|
||||
* ...
|
||||
@@ -771,4 +861,31 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
|
||||
signs_a, bsk, ksk, 1, lut);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_apply_univariate_lut_kb(cuda_stream_t *stream,
|
||||
int_radix_lut<Torus> **mem_ptr,
|
||||
Torus *input_lut,
|
||||
uint32_t num_radix_blocks,
|
||||
int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_radix_lut<Torus>(stream, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut,
|
||||
(params.glwe_dimension + 1) *
|
||||
params.polynomial_size * sizeof(Torus),
|
||||
stream);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_univariate_lut_kb(cuda_stream_t *stream, Torus *radix_lwe_out,
|
||||
Torus *radix_lwe_in,
|
||||
int_radix_lut<Torus> *mem, Torus *ksk,
|
||||
void *bsk, uint32_t num_blocks) {
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, radix_lwe_out, radix_lwe_in, bsk, ksk, num_blocks, mem);
|
||||
}
|
||||
|
||||
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
|
||||
|
||||
@@ -79,14 +79,20 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
case 512:
|
||||
case 1024:
|
||||
case 2048:
|
||||
case 4096:
|
||||
case 8192:
|
||||
case 16384:
|
||||
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
|
||||
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
"Only N = 2048 is supported")
|
||||
"Supported N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,6 +134,30 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsk,
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsk,
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsk,
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
@@ -136,9 +166,33 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsk,
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsk,
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsk,
|
||||
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
"Only N = 2048 is supported")
|
||||
"Supported N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -225,8 +279,8 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
num_radix_in_vec);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
|
||||
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
"Supported N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
|
||||
free(terms_degree);
|
||||
|
||||
@@ -91,12 +91,15 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, sharedMemDegree SMD>
|
||||
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
uint32_t chunk_size, uint32_t block_size,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
extern __shared__ Torus result[];
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
|
||||
Torus *result = (Torus *)sharedmem;
|
||||
|
||||
size_t stride = blockDim.x;
|
||||
size_t chunk_id = blockIdx.x;
|
||||
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
|
||||
@@ -106,6 +109,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
size_t block_stride = blockIdx.y * block_size;
|
||||
auto dst_block = &dst_radix[block_stride];
|
||||
|
||||
if constexpr (SMD == NOSM)
|
||||
result = dst_block;
|
||||
|
||||
// init shared mem with first radix of chunk
|
||||
size_t tid = threadIdx.x;
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
@@ -121,9 +127,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
}
|
||||
|
||||
// put result from shared mem to global mem
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
dst_block[i] = result[i];
|
||||
}
|
||||
if constexpr (SMD == FULLSM)
|
||||
for (int i = tid; i < block_size; i += stride)
|
||||
dst_block[i] = result[i];
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -181,11 +187,20 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
stream, params, num_blocks_in_radix, max_num_radix_in_vec,
|
||||
allocate_gpu_memory);
|
||||
@@ -203,11 +218,6 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
auto old_blocks = mem_ptr->old_blocks;
|
||||
auto small_lwe_vector = mem_ptr->small_lwe_vector;
|
||||
|
||||
auto luts_message_carry = mem_ptr->luts_message_carry;
|
||||
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
|
||||
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
|
||||
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
|
||||
|
||||
@@ -247,8 +257,16 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
dim3 add_grid(ch_amount, num_blocks, 1);
|
||||
size_t sm_size = big_lwe_size * sizeof(Torus);
|
||||
|
||||
tree_add_chunks<Torus><<<add_grid, 512, sm_size, stream->stream>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
if (sm_size < max_shared_memory)
|
||||
tree_add_chunks<Torus, FULLSM>
|
||||
<<<add_grid, 512, sm_size, stream->stream>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size,
|
||||
num_blocks);
|
||||
else
|
||||
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, stream->stream>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
size_t total_count = 0;
|
||||
size_t message_count = 0;
|
||||
@@ -260,6 +278,34 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
|
||||
total_count, message_count, carry_count, sm_copy_count);
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
auto luts_message_carry =
|
||||
new int_radix_lut<Torus>(stream, mem_ptr->params, 2, total_count, true);
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(0);
|
||||
auto carry_acc = luts_message_carry->get_lut(1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, lut_f_message);
|
||||
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, lut_f_carry);
|
||||
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
|
||||
size_t copy_size = total_count * sizeof(Torus);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
|
||||
@@ -272,6 +318,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
|
||||
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
|
||||
big_lwe_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (carry_count > 0)
|
||||
cuda_set_value_async<Torus>(
|
||||
@@ -291,6 +338,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
|
||||
2, 0, max_shared_memory, mem_ptr->params.pbs_type);
|
||||
|
||||
luts_message_carry->release(stream);
|
||||
|
||||
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
|
||||
int new_blocks_created = 2 * ch_amount * num_blocks;
|
||||
copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
|
||||
@@ -386,6 +435,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
|
||||
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
|
||||
vector_lsb_rhs, vector_msb_rhs, num_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
|
||||
@@ -401,6 +451,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
vector_result_msb, glwe_dimension,
|
||||
lsb_vector_block_count, msb_vector_block_count,
|
||||
num_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int terms_degree[2 * num_blocks * num_blocks];
|
||||
for (int i = 0; i < num_blocks * num_blocks; i++) {
|
||||
@@ -427,11 +478,20 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
@@ -35,11 +35,20 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<T>, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T>, cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
@@ -1 +1,44 @@
|
||||
#include "programmable_bootstrap.cuh"
|
||||
|
||||
template <>
|
||||
__device__ int get_this_block_rank(grid_group &group, bool support_dsm) {
|
||||
return blockIdx.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ double2 *
|
||||
get_join_buffer_element(int level_id, int glwe_id, grid_group &group,
|
||||
double2 *global_memory_buffer, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, bool support_dsm) {
|
||||
double2 *buffer_slice =
|
||||
global_memory_buffer +
|
||||
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
|
||||
return buffer_slice;
|
||||
}
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template <>
|
||||
__device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) {
|
||||
if (support_dsm)
|
||||
return cluster.block_rank();
|
||||
else
|
||||
return blockIdx.y;
|
||||
}
|
||||
template <>
|
||||
__device__ double2 *
|
||||
get_join_buffer_element(int level_id, int glwe_id, cluster_group &cluster,
|
||||
double2 *global_memory_buffer, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, bool support_dsm) {
|
||||
double2 *buffer_slice;
|
||||
if (support_dsm) {
|
||||
extern __shared__ double2 smem[];
|
||||
buffer_slice = cluster.map_shared_rank(
|
||||
smem, glwe_id + level_id * (glwe_dimension + 1));
|
||||
} else {
|
||||
buffer_slice =
|
||||
global_memory_buffer +
|
||||
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
|
||||
}
|
||||
return buffer_slice;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,8 +1,120 @@
|
||||
#include "../../include/device.h"
|
||||
#include "../../include/programmable_bootstrap.h"
|
||||
#include "../include/device.h"
|
||||
#include "programmable_bootstrap_classic.cuh"
|
||||
#include "programmable_bootstrap_multibit.cuh"
|
||||
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
|
||||
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename G>
|
||||
__device__ int get_this_block_rank(G &group, bool support_dsm);
|
||||
|
||||
template <typename G>
|
||||
__device__ double2 *
|
||||
get_join_buffer_element(int level_id, int glwe_id, G &group,
|
||||
double2 *global_memory_buffer, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, bool support_dsm);
|
||||
|
||||
template <typename Torus, typename G, class params>
|
||||
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
double2 *join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension,
|
||||
int level_count, int iteration, G &group,
|
||||
bool support_dsm = false) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
|
||||
auto buffer_slice =
|
||||
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
|
||||
polynomial_size, glwe_dimension, support_dsm);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
group.sync();
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
int idx = (j + this_block_rank) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
|
||||
join_buffer, polynomial_size,
|
||||
glwe_dimension, support_dsm);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
group.sync();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
auto src_acc =
|
||||
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
|
||||
polynomial_size, glwe_dimension, support_dsm);
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
|
||||
join_buffer, polynomial_size,
|
||||
glwe_dimension, support_dsm);
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
@@ -112,3 +224,5 @@ void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
|
||||
"moduli are supported.")
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -67,8 +67,7 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
"N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#endif
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
@@ -15,104 +14,13 @@
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
// Cooperative groups are used for this implementation
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
double2 *join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension,
|
||||
int level_count, int iteration,
|
||||
grid_group &grid) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Selects all GLWEs in a particular decomposition level
|
||||
auto level_join_buffer =
|
||||
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
grid.sync();
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
grid.sync();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
/*
|
||||
* Kernel that computes the classical PBS using cooperative groups
|
||||
*
|
||||
@@ -222,7 +130,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, params>(
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef CUDA_FAST_MULTIBIT_PBS_CUH
|
||||
#define CUDA_FAST_MULTIBIT_PBS_CUH
|
||||
#ifndef CUDA_CG_MULTIBIT_PBS_CUH
|
||||
#define CUDA_CG_MULTIBIT_PBS_CUH
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
@@ -106,9 +107,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
|
||||
block_join_buffer, keybundle, polynomial_size,
|
||||
glwe_dimension, level_count, i, grid);
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, keybundle,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
@@ -240,7 +241,9 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size(input_lwe_ciphertext_count);
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
|
||||
max_shared_memory);
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
|
||||
@@ -248,7 +251,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void execute_external_product_loop(
|
||||
__host__ void execute_cg_external_product_loop(
|
||||
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
|
||||
@@ -336,7 +339,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size(num_samples);
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
@@ -349,7 +353,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
lwe_chunk_size, lwe_offset);
|
||||
|
||||
// Accumulate
|
||||
execute_external_product_loop<Torus, params>(
|
||||
execute_cg_external_product_loop<Torus, params>(
|
||||
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#include "programmable_bootstrap_cg_classic.cuh"
|
||||
#include "programmable_bootstrap_classic.cuh"
|
||||
#if (CUDA_ARCH >= 900)
|
||||
#include "programmable_bootstrap_tbc_classic.cuh"
|
||||
#endif
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
@@ -12,6 +15,176 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_luts, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Returns the buffer size for 64 bits executions
|
||||
*/
|
||||
@@ -143,9 +316,19 @@ void scratch_cuda_programmable_bootstrap_32(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
|
||||
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
|
||||
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
@@ -168,9 +351,19 @@ void scratch_cuda_programmable_bootstrap_64(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
@@ -321,7 +514,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
@@ -330,9 +523,28 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
PANIC("Cuda error (classical PBS): base log should be > number of bits "
|
||||
"in the ciphertext representation (32)");
|
||||
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory))
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
static_cast<uint32_t *>(lut_vector),
|
||||
static_cast<uint32_t *>(lut_vector_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
@@ -344,7 +556,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
else
|
||||
break;
|
||||
case DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
@@ -356,6 +569,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
|
||||
@@ -433,7 +650,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
@@ -441,9 +658,28 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
PANIC("Cuda error (classical PBS): base log should be > number of bits "
|
||||
"in the ciphertext representation (64)");
|
||||
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory))
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if (CUDA_ARCH >= 900)
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
@@ -455,7 +691,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
else
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
@@ -467,6 +704,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -546,3 +787,41 @@ template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
cuda_stream_t *stream, uint32_t *lwe_array_out,
|
||||
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
|
||||
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
|
||||
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_stream_t *stream, uint64_t *lwe_array_out,
|
||||
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
|
||||
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
|
||||
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
|
||||
cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
|
||||
cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
#endif
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
#include "programmable_bootstrap_multibit.cuh"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
#include "programmable_bootstrap_tbc_multibit.cuh"
|
||||
#endif
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory) {
|
||||
@@ -11,6 +15,57 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
@@ -185,15 +240,34 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory))
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
@@ -201,11 +275,11 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
else
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
@@ -213,10 +287,13 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
@@ -356,9 +433,20 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
else
|
||||
#endif
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
@@ -378,25 +466,69 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
|
||||
x->release(stream);
|
||||
}
|
||||
|
||||
// Returns a chunk size that is not optimal but close to
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count) {
|
||||
/**
|
||||
* Computes divisors of the product of num_sms (streaming multiprocessors on the
|
||||
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
|
||||
* device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
|
||||
* root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
|
||||
* suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs)
|
||||
* - 13) for additional logic.
|
||||
*
|
||||
* The value 13 was empirically determined based on memory requirements for
|
||||
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
// Tesla H100
|
||||
return (ct_count > 10000) ? 30 : 64;
|
||||
#elif CUDA_ARCH >= 890
|
||||
// Tesla RTX4090
|
||||
return 8;
|
||||
#elif CUDA_ARCH >= 800
|
||||
// Tesla A100
|
||||
return (ct_count > 10000) ? 30 : 45;
|
||||
#elif CUDA_ARCH >= 700
|
||||
// Tesla V100
|
||||
return (ct_count > 10000) ? 12 : 18;
|
||||
#else
|
||||
// Generic case
|
||||
return (ct_count > 10000) ? 2 : 1;
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_blocks_per_sm;
|
||||
if (max_shared_memory < full_sm_keybundle)
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
polynomial_size / params::opt, full_sm_keybundle);
|
||||
else
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
polynomial_size / params::opt, 0);
|
||||
|
||||
int num_sms = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
int x = num_sms * max_blocks_per_sm;
|
||||
int count = 0;
|
||||
|
||||
int divisor = 1;
|
||||
int ith_divisor = 0;
|
||||
|
||||
#if CUDA_ARCH < 900
|
||||
// We pick a smaller divisor on GPUs other than H100, so 256-bit integer
|
||||
// multiplication can run
|
||||
int log2_max_num_pbs = std::log2(max_num_pbs);
|
||||
if (log2_max_num_pbs > 13)
|
||||
ith_divisor = log2_max_num_pbs - 11;
|
||||
#endif
|
||||
|
||||
for (int i = sqrt(x); i >= 1; i--) {
|
||||
if (x % i == 0) {
|
||||
if (count == ith_divisor) {
|
||||
divisor = i;
|
||||
break;
|
||||
} else {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return divisor;
|
||||
}
|
||||
|
||||
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
@@ -437,3 +569,180 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<256>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<512>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<1024>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<2048>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<4096>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<8192>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<16384>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
template <typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
"the ciphertext representation (64)");
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<512>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
template void
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
|
||||
template void
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_stream_t *stream, uint64_t *lwe_array_out,
|
||||
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
|
||||
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
#endif
|
||||
|
||||
@@ -227,8 +227,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
|
||||
blockIdx.x);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
@@ -469,7 +468,9 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size(input_lwe_ciphertext_count);
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
|
||||
max_shared_memory);
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
|
||||
@@ -630,7 +631,8 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
|
||||
// If a chunk size is not passed to this function, select one.
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size(num_samples);
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
@@ -0,0 +1,424 @@
|
||||
#ifndef CUDA_TBC_PBS_CUH
|
||||
#define CUDA_TBC_PBS_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
/*
|
||||
* Kernel that computes the classical PBS using cooperative groups
|
||||
*
|
||||
* - lwe_array_out: vector of output lwe s, with length
|
||||
* (glwe_dimension * polynomial_size+1)*num_samples
|
||||
* - lut_vector: vector of look up tables with
|
||||
* length (glwe_dimension+1) * polynomial_size * num_samples
|
||||
* - lut_vector_indexes: mapping between lwe_array_in and lut_vector
|
||||
* lwe_array_in: vector of lwe inputs with length (lwe_dimension + 1) *
|
||||
* num_samples
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_tbc(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
bool support_dsm) {
|
||||
|
||||
cluster_group cluster = this_cluster();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
uint32_t glwe_dimension = gridDim.y - 1;
|
||||
|
||||
if constexpr (SMD == FULLSM) {
|
||||
selected_memory = sharedmem;
|
||||
if (support_dsm)
|
||||
selected_memory += sizeof(Torus) * polynomial_size;
|
||||
} else {
|
||||
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
|
||||
blockIdx.z * gridDim.x * gridDim.y;
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator_rotated +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
|
||||
if constexpr (SMD == PARTIALSM) {
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
if (support_dsm)
|
||||
accumulator_fft += (ptrdiff_t)(polynomial_size / 2);
|
||||
}
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
// Since the space is L1 cache is small, we use the same memory location for
|
||||
// the rotated accumulator and the fft accumulator, since we know that the
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[i], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, cluster_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_tbc(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc = 0;
|
||||
if (supports_dsm)
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm + minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, CLASSICAL>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
/*
|
||||
* Host wrapper
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_programmable_bootstrap_tbc(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc = 0;
|
||||
if (supports_dsm)
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t full_dm = full_sm;
|
||||
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem = buffer->d_mem;
|
||||
double2 *buffer_fft = buffer->global_accumulator_fft;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
cudaLaunchConfig_t config = {0};
|
||||
// The grid dimension is not affected by cluster launch, and is still
|
||||
// enumerated using number of blocks. The grid dimension should be a multiple
|
||||
// of cluster size.
|
||||
config.gridDim = grid;
|
||||
config.blockDim = thds;
|
||||
|
||||
cudaLaunchAttribute attribute[1];
|
||||
attribute[0].id = cudaLaunchAttributeClusterDimension;
|
||||
attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
|
||||
attribute[0].val.clusterDim.y = (glwe_dimension + 1);
|
||||
attribute[0].val.clusterDim.z = 1;
|
||||
config.attrs = attribute;
|
||||
config.numAttrs = 1;
|
||||
config.stream = stream->stream;
|
||||
|
||||
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
|
||||
config.dynamicSmemBytes = minimum_sm_tbc;
|
||||
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
|
||||
supports_dsm));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
|
||||
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
partial_dm, supports_dsm));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
|
||||
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
|
||||
supports_dsm));
|
||||
}
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
return false;
|
||||
|
||||
// Calculate the dimension of the kernel
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(params::degree);
|
||||
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
|
||||
params::degree);
|
||||
|
||||
int thds = params::degree / params::opt;
|
||||
|
||||
// Get the maximum number of active blocks per streaming multiprocessors
|
||||
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_tbc<Torus, params, NOSM>, thds,
|
||||
0);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
thds, partial_sm);
|
||||
} else {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_tbc<Torus, params, FULLSM>, thds,
|
||||
full_sm);
|
||||
}
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
uint64_t minimum_sm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
if (max_shared_memory < minimum_sm) {
|
||||
// If we cannot store a single polynomial in a block shared memory we cannot
|
||||
// use TBC
|
||||
return false;
|
||||
} else {
|
||||
return cuda_check_support_thread_block_clusters();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
|
||||
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
|
||||
return false;
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc = 0;
|
||||
if (supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory))
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int cluster_size;
|
||||
|
||||
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
cudaLaunchConfig_t config = {0};
|
||||
// The grid dimension is not affected by cluster launch, and is still
|
||||
// enumerated using number of blocks. The grid dimension should be a multiple
|
||||
// of cluster size.
|
||||
config.gridDim = grid_accumulate;
|
||||
config.blockDim = thds;
|
||||
config.numAttrs = 0;
|
||||
|
||||
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
&config));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, &config));
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
&config));
|
||||
}
|
||||
|
||||
return cluster_size >= level_count * (glwe_dimension + 1);
|
||||
}
|
||||
|
||||
#endif // CG_PBS_H
|
||||
@@ -0,0 +1,480 @@
|
||||
#ifndef CUDA_TBC_MULTIBIT_PBS_CUH
|
||||
#define CUDA_TBC_MULTIBIT_PBS_CUH
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/ggsw.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, bool support_dsm) {
|
||||
|
||||
cluster_group cluster = this_cluster();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
|
||||
if constexpr (SMD == FULLSM) {
|
||||
// The first (polynomial_size/2) * sizeof(double2) bytes are reserved for
|
||||
// external product using distributed shared memory
|
||||
selected_memory = sharedmem;
|
||||
if (support_dsm)
|
||||
selected_memory += sizeof(Torus) * polynomial_size;
|
||||
} else {
|
||||
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
|
||||
blockIdx.z * gridDim.x * gridDim.y;
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
|
||||
if constexpr (SMD == PARTIALSM) {
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
if (support_dsm)
|
||||
accumulator_fft += sizeof(double2) * (polynomial_size / 2);
|
||||
}
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
}
|
||||
|
||||
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, cluster_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, keybundle,
|
||||
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // distributed shared memory
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size * 2; // accumulator
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_tbc_accumulate =
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_tbc_accumulate =
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc_accumulate = 0;
|
||||
if (supports_dsm)
|
||||
minimum_sm_tbc_accumulate =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (max_shared_memory <
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory <
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
|
||||
max_shared_memory);
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void execute_tbc_external_product_loop(
|
||||
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
|
||||
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_chunk_size, uint32_t max_shared_memory,
|
||||
int lwe_offset) {
|
||||
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
|
||||
uint64_t full_dm =
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_dm =
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_dm = 0;
|
||||
if (supports_dsm)
|
||||
minimum_dm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_tbc;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto buffer_fft = buffer->global_accumulator_fft;
|
||||
|
||||
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
cudaLaunchConfig_t config = {0};
|
||||
// The grid dimension is not affected by cluster launch, and is still
|
||||
// enumerated using number of blocks. The grid dimension should be a multiple
|
||||
// of cluster size.
|
||||
config.gridDim = grid_accumulate;
|
||||
config.blockDim = thds;
|
||||
|
||||
cudaLaunchAttribute attribute[1];
|
||||
attribute[0].id = cudaLaunchAttributeClusterDimension;
|
||||
attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
|
||||
attribute[0].val.clusterDim.y = (glwe_dimension + 1);
|
||||
attribute[0].val.clusterDim.z = 1;
|
||||
config.attrs = attribute;
|
||||
config.numAttrs = 1;
|
||||
config.stream = stream->stream;
|
||||
|
||||
if (max_shared_memory < partial_dm + minimum_dm) {
|
||||
config.dynamicSmemBytes = minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, full_dm, supports_dsm));
|
||||
} else if (max_shared_memory < full_dm + minimum_dm) {
|
||||
config.dynamicSmemBytes = partial_dm + minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, partial_dm, supports_dsm));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_dm + minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, 0, supports_dsm));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
__host__ void host_tbc_multi_bit_programmable_bootstrap(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
stream, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, max_shared_memory,
|
||||
lwe_chunk_size, lwe_offset);
|
||||
|
||||
// Accumulate
|
||||
execute_tbc_external_product_loop<Torus, params>(
|
||||
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
lwe_chunk_size, max_shared_memory, lwe_offset);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
uint64_t minimum_sm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
if (max_shared_memory <= minimum_sm) {
|
||||
// If we cannot store a single polynomial in a block shared memory we
|
||||
// cannot use TBC
|
||||
return false;
|
||||
} else {
|
||||
return cuda_check_support_thread_block_clusters();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
|
||||
if (!cuda_check_support_thread_block_clusters())
|
||||
return false;
|
||||
|
||||
uint64_t full_sm_tbc_accumulate =
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_tbc_accumulate =
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc_accumulate = 0;
|
||||
if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory))
|
||||
minimum_sm_tbc_accumulate =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int cluster_size;
|
||||
|
||||
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
cudaLaunchConfig_t config = {0};
|
||||
// The grid dimension is not affected by cluster launch, and is still
|
||||
// enumerated using number of blocks. The grid dimension should be a multiple
|
||||
// of cluster size.
|
||||
config.gridDim = grid_accumulate;
|
||||
config.blockDim = thds;
|
||||
config.numAttrs = 0;
|
||||
|
||||
if (max_shared_memory <
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
&config));
|
||||
} else if (max_shared_memory <
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
&config));
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
&config));
|
||||
}
|
||||
|
||||
return cluster_size >= level_count * (glwe_dimension + 1);
|
||||
}
|
||||
|
||||
template __host__ bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
#endif // FASTMULTIBIT_PBS_H
|
||||
@@ -50,12 +50,7 @@ public:
|
||||
constexpr static int opt = choose_opt_amortized(N);
|
||||
constexpr static int log2_degree = log2(N);
|
||||
};
|
||||
enum sharedMemDegree {
|
||||
NOSM = 0,
|
||||
PARTIALSM = 1,
|
||||
FULLSM = 2
|
||||
|
||||
};
|
||||
enum sharedMemDegree { NOSM = 0, PARTIALSM = 1, FULLSM = 2 };
|
||||
|
||||
class ForwardFFT {
|
||||
public:
|
||||
|
||||
@@ -167,6 +167,38 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
|
||||
st.SkipWithError("Configuration not supported for tbc operation");
|
||||
return;
|
||||
}
|
||||
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
true, chunk_size);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
|
||||
pbs_level, input_lwe_ciphertext_count, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
|
||||
}
|
||||
#endif
|
||||
|
||||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
@@ -221,6 +253,39 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
|
||||
}
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
|
||||
st.SkipWithError("Configuration not supported for tbc operation");
|
||||
return;
|
||||
}
|
||||
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
|
||||
polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
stream, (uint64_t *)d_lwe_ct_out_array,
|
||||
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
|
||||
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
|
||||
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
input_lwe_ciphertext_count, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index));
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_programmable_bootstrap(stream, &buffer);
|
||||
}
|
||||
#endif
|
||||
|
||||
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
@@ -317,6 +382,9 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
for (auto x : params) {
|
||||
for (int input_lwe_ciphertext_count = 1; input_lwe_ciphertext_count <= 4096;
|
||||
input_lwe_ciphertext_count *= 2) {
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
|
||||
x.grouping_factor, 0});
|
||||
for (int lwe_chunk_size = 1;
|
||||
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
|
||||
lwe_chunk_size *= 2)
|
||||
@@ -332,6 +400,25 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
CGBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
// Define the parameters to benchmark
|
||||
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
// input_lwe_ciphertext_count
|
||||
|
||||
// PARAM_MESSAGE_2_CARRY_2_KS_PBS
|
||||
std::vector<BootstrapBenchmarkParams> params = {
|
||||
(BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1},
|
||||
};
|
||||
|
||||
// Add to the list of parameters to benchmark
|
||||
for (int num_samples = 1; num_samples <= 4096; num_samples *= 2)
|
||||
for (auto x : params) {
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, num_samples});
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
// Define the parameters to benchmark
|
||||
@@ -351,6 +438,14 @@ BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
}
|
||||
}
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
#endif
|
||||
|
||||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, CgMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
@@ -363,11 +458,23 @@ BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
->Apply(BootstrapBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
|
||||
#endif
|
||||
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultPBS)
|
||||
->Apply(BootstrapBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
|
||||
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, CgPBS)
|
||||
->Apply(BootstrapBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
|
||||
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
|
||||
->Apply(BootstrapBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
|
||||
#include "tfhe.h"
|
||||
#include <device.h>
|
||||
#include <functional>
|
||||
#include <tfhe.h>
|
||||
|
||||
typedef struct Seed {
|
||||
uint64_t lo;
|
||||
|
||||
@@ -205,7 +205,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
|
||||
uint64_t decrypted = 0;
|
||||
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
|
||||
glwe_dimension * polynomial_size);
|
||||
EXPECT_NE(decrypted, plaintext);
|
||||
ASSERT_NE(decrypted, plaintext);
|
||||
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
|
||||
// plaintext
|
||||
// - decrypted;
|
||||
@@ -216,7 +216,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
|
||||
// Compute the rounding bit
|
||||
uint64_t rounding = (decrypted & rounding_bit) << 1;
|
||||
uint64_t decoded = (decrypted + rounding) / delta;
|
||||
EXPECT_EQ(decoded, plaintext / delta);
|
||||
ASSERT_EQ(decoded, plaintext / delta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,7 +227,13 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
|
||||
888, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 10});
|
||||
128, 3, 1, 10},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
972, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
|
||||
68, 3, 1, 1});
|
||||
|
||||
std::string printParamName(
|
||||
::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {
|
||||
MultiBitProgrammableBootstrapTestParams params = p.param;
|
||||
|
||||
@@ -687,6 +687,37 @@ extern "C" {
|
||||
|
||||
pub fn cleanup_cuda_full_propagation(v_stream: *const c_void, mem_ptr: *mut *mut i8);
|
||||
|
||||
pub fn scratch_cuda_apply_univariate_lut_kb_64(
|
||||
v_stream: *const c_void,
|
||||
mem_ptr: *mut *mut i8,
|
||||
input_lut: *const c_void,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
);
|
||||
|
||||
pub fn cuda_apply_univariate_lut_kb_64(
|
||||
v_stream: *const c_void,
|
||||
output_radix_lwe: *mut c_void,
|
||||
input_radix_lwe: *const c_void,
|
||||
mem_ptr: *mut i8,
|
||||
ksk: *const c_void,
|
||||
bsk: *const c_void,
|
||||
num_blocks: u32,
|
||||
);
|
||||
|
||||
pub fn cleanup_cuda_apply_univariate_lut_kb_64(v_stream: *const c_void, mem_ptr: *mut *mut i8);
|
||||
|
||||
pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
v_stream: *const c_void,
|
||||
mem_ptr: *mut *mut i8,
|
||||
@@ -959,4 +990,37 @@ extern "C" {
|
||||
mem_ptr: *mut *mut i8,
|
||||
);
|
||||
|
||||
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
v_stream: *const c_void,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
);
|
||||
|
||||
pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
v_stream: *const c_void,
|
||||
quotient: *mut c_void,
|
||||
remainder: *mut c_void,
|
||||
numerator: *const c_void,
|
||||
divisor: *const c_void,
|
||||
mem_ptr: *mut i8,
|
||||
bsk: *const c_void,
|
||||
ksk: *const c_void,
|
||||
num_blocks: u32,
|
||||
);
|
||||
|
||||
pub fn cleanup_cuda_integer_div_rem(v_stream: *const c_void, mem_ptr: *mut *mut i8);
|
||||
|
||||
} // extern "C"
|
||||
|
||||
@@ -57,7 +57,7 @@ def check_security(filename):
|
||||
estimator_level = log(min(usvp_level["rop"], dual_level["rop"]),2 )
|
||||
security_level = f"security level = {estimator_level} bits"
|
||||
if estimator_level < 127:
|
||||
print("FAIL\t({security_level})")
|
||||
print(f"FAIL\t({security_level})")
|
||||
reason = f"attained {security_level} target is 128 bits"
|
||||
to_update.append((param, reason))
|
||||
continue
|
||||
|
||||
@@ -8,11 +8,13 @@ function usage() {
|
||||
echo "--help Print this message"
|
||||
echo "--build-only Pass to only build the tests without running them"
|
||||
echo "--gpu Enable GPU support"
|
||||
echo "--cargo-profile The profile used to build TFHE-rs, release by default"
|
||||
echo
|
||||
}
|
||||
|
||||
BUILD_ONLY=0
|
||||
WITH_FEATURE_GPU="OFF"
|
||||
CARGO_PROFILE="release"
|
||||
while [ -n "$1" ]
|
||||
do
|
||||
case "$1" in
|
||||
@@ -28,6 +30,12 @@ do
|
||||
"--gpu" )
|
||||
WITH_FEATURE_GPU="ON"
|
||||
;;
|
||||
|
||||
"--cargo-profile" )
|
||||
shift
|
||||
CARGO_PROFILE="$1"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Unknown param : $1"
|
||||
exit 1
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![allow(non_local_definitions)]
|
||||
|
||||
use ark_ec::bls12::{Bls12, Bls12Config, TwistType};
|
||||
use ark_ff::fields::*;
|
||||
use ark_ff::MontFp;
|
||||
@@ -576,8 +578,8 @@ pub mod g1 {
|
||||
writer.write_all(&bytes)?;
|
||||
} else {
|
||||
let mut bytes = [0u8; 2 * G1_SERIALIZED_SIZE];
|
||||
bytes[1..1 + G1_SERIALIZED_SIZE].copy_from_slice(&x_bytes[..]);
|
||||
bytes[2 + G1_SERIALIZED_SIZE..].copy_from_slice(&serialize_fq(p.y)[..]);
|
||||
bytes[1..G1_SERIALIZED_SIZE].copy_from_slice(&x_bytes[..]);
|
||||
bytes[1 + G1_SERIALIZED_SIZE..].copy_from_slice(&serialize_fq(p.y)[..]);
|
||||
|
||||
encoding.encode_flags(&mut bytes);
|
||||
writer.write_all(&bytes)?;
|
||||
|
||||
@@ -91,6 +91,21 @@ pub struct PrivateCommit<G: Curve> {
|
||||
__marker: PhantomData<G>,
|
||||
}
|
||||
|
||||
pub fn compute_crs_params(
|
||||
d: usize,
|
||||
k: usize,
|
||||
b: u64,
|
||||
_q: u64, // we keep q here to make sure the API is consistent with [crs_gen]
|
||||
t: u64,
|
||||
) -> (usize, usize, u64) {
|
||||
let b_r = d as u64 / 2 + 1;
|
||||
|
||||
let big_d =
|
||||
d + k * t.ilog2() as usize + (d + k) * (2 + b.ilog2() as usize + b_r.ilog2() as usize);
|
||||
let n = big_d + 1;
|
||||
(n, big_d, b_r)
|
||||
}
|
||||
|
||||
pub fn crs_gen<G: Curve>(
|
||||
d: usize,
|
||||
k: usize,
|
||||
@@ -100,11 +115,7 @@ pub fn crs_gen<G: Curve>(
|
||||
rng: &mut dyn RngCore,
|
||||
) -> PublicParams<G> {
|
||||
let alpha = G::Zp::rand(rng);
|
||||
let b_r = d as u64 / 2 + 1;
|
||||
|
||||
let big_d =
|
||||
d + k * t.ilog2() as usize + (d + k) * (2 + b.ilog2() as usize + b_r.ilog2() as usize);
|
||||
let n = big_d + 1;
|
||||
let (n, big_d, b_r) = compute_crs_params(d, k, b, q, t);
|
||||
PublicParams {
|
||||
g_lists: GroupElements::<G>::new(n, alpha),
|
||||
big_d,
|
||||
|
||||
@@ -11,6 +11,7 @@ use tfhe::keycache::NamedParam;
|
||||
use tfhe::shortint::parameters::{
|
||||
PARAM_GPU_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
|
||||
PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
|
||||
PARAM_GPU_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
|
||||
};
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
use tfhe::shortint::parameters::{
|
||||
@@ -42,9 +43,10 @@ const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 6] = [
|
||||
];
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 2] = [
|
||||
const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 3] = [
|
||||
PARAM_GPU_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
|
||||
PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
|
||||
PARAM_GPU_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
|
||||
];
|
||||
|
||||
const BOOLEAN_BENCH_PARAMS: [(&str, BooleanParameters); 2] = [
|
||||
|
||||
@@ -39,9 +39,9 @@ impl Default for ParamsAndNumBlocksIter {
|
||||
|
||||
if env_config.is_multi_bit {
|
||||
#[cfg(feature = "gpu")]
|
||||
let params = vec![PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
|
||||
let params = vec![PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
|
||||
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
|
||||
|
||||
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
|
||||
Self {
|
||||
@@ -1538,6 +1538,11 @@ mod cuda {
|
||||
display_name: mul
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_fn!(
|
||||
method_name: unchecked_div_rem,
|
||||
display_name: div_rem
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_fn!(
|
||||
method_name: unchecked_add,
|
||||
display_name: add
|
||||
@@ -1720,6 +1725,11 @@ mod cuda {
|
||||
display_name: mul
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_fn!(
|
||||
method_name: div_rem,
|
||||
display_name: div_rem
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_fn!(
|
||||
method_name: ne,
|
||||
display_name: not_equal
|
||||
@@ -1898,6 +1908,7 @@ mod cuda {
|
||||
cuda_unchecked_bitor,
|
||||
cuda_unchecked_bitxor,
|
||||
cuda_unchecked_mul,
|
||||
cuda_unchecked_div_rem,
|
||||
cuda_unchecked_sub,
|
||||
cuda_unchecked_unsigned_overflowing_sub,
|
||||
cuda_unchecked_add,
|
||||
@@ -1938,6 +1949,7 @@ mod cuda {
|
||||
cuda_unsigned_overflowing_sub,
|
||||
cuda_add,
|
||||
cuda_mul,
|
||||
cuda_div_rem,
|
||||
cuda_eq,
|
||||
cuda_ne,
|
||||
cuda_ge,
|
||||
@@ -1982,7 +1994,7 @@ mod cuda {
|
||||
display_name: &str,
|
||||
cast_op: F,
|
||||
) where
|
||||
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize),
|
||||
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize, &CudaStream),
|
||||
{
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
@@ -2021,7 +2033,7 @@ mod cuda {
|
||||
b.iter_batched(
|
||||
encrypt_one_value,
|
||||
|ct| {
|
||||
cast_op(&gpu_sks, ct, target_num_blocks);
|
||||
cast_op(&gpu_sks, ct, target_num_blocks, &stream);
|
||||
},
|
||||
criterion::BatchSize::SmallInput,
|
||||
)
|
||||
@@ -2046,12 +2058,12 @@ mod cuda {
|
||||
(method_name: $server_key_method:ident, display_name:$name:ident) => {
|
||||
::paste::paste!{
|
||||
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
|
||||
bench_server_key_cast_function(
|
||||
cuda_bench_server_key_cast_function(
|
||||
c,
|
||||
concat!("integer::cuda::", stringify!($server_key_method)),
|
||||
stringify!($name),
|
||||
|server_key, lhs, rhs| {
|
||||
server_key.$server_key_method(lhs, rhs);
|
||||
|server_key, lhs, rhs, stream| {
|
||||
server_key.$server_key_method(lhs, rhs, stream);
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -2406,7 +2418,7 @@ fn go_through_gpu_bench_groups(val: &str) {
|
||||
"default" => {
|
||||
default_cuda_ops();
|
||||
default_scalar_cuda_ops();
|
||||
cuda_cast_ops()
|
||||
cuda_cast_ops();
|
||||
}
|
||||
"unchecked" => {
|
||||
unchecked_cuda_ops();
|
||||
|
||||
@@ -826,7 +826,7 @@ macro_rules! define_server_key_bench_binary_scalar_clean_inputs_fn (
|
||||
fn $server_key_method(c: &mut Criterion) {
|
||||
bench_server_key_binary_scalar_function_clean_inputs(
|
||||
c,
|
||||
concat!("integer::", stringify!($server_key_method)),
|
||||
concat!("integer::signed::", stringify!($server_key_method)),
|
||||
stringify!($name),
|
||||
|server_key, lhs, rhs| {
|
||||
server_key.$server_key_method(lhs, rhs);
|
||||
@@ -2147,11 +2147,106 @@ mod cuda {
|
||||
cuda_scalar_min,
|
||||
cuda_scalar_max,
|
||||
);
|
||||
|
||||
fn cuda_bench_server_key_signed_cast_function<F>(
|
||||
c: &mut Criterion,
|
||||
bench_name: &str,
|
||||
display_name: &str,
|
||||
cast_op: F,
|
||||
) where
|
||||
F: Fn(&CudaServerKey, CudaSignedRadixCiphertext, usize, &CudaStream),
|
||||
{
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
.sample_size(15)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
let env_config = EnvConfig::new();
|
||||
let gpu_index = 0;
|
||||
let device = CudaDevice::new(gpu_index);
|
||||
let stream = CudaStream::new_unchecked(device);
|
||||
|
||||
for (param, num_blocks, bit_size) in ParamsAndNumBlocksIter::default() {
|
||||
let all_num_blocks = env_config
|
||||
.bit_sizes()
|
||||
.iter()
|
||||
.copied()
|
||||
.map(|bit| bit.div_ceil(param.message_modulus().0.ilog2() as usize))
|
||||
.collect::<Vec<_>>();
|
||||
let param_name = param.name();
|
||||
|
||||
for target_num_blocks in all_num_blocks.iter().copied() {
|
||||
let target_bit_size =
|
||||
target_num_blocks * param.message_modulus().0.ilog2() as usize;
|
||||
let bench_id =
|
||||
format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
let encrypt_one_value = || -> CudaSignedRadixCiphertext {
|
||||
let ct = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_blocks);
|
||||
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &stream)
|
||||
};
|
||||
|
||||
b.iter_batched(
|
||||
encrypt_one_value,
|
||||
|ct| {
|
||||
cast_op(&gpu_sks, ct, target_num_blocks, &stream);
|
||||
},
|
||||
criterion::BatchSize::SmallInput,
|
||||
)
|
||||
});
|
||||
|
||||
write_to_json::<u64, _>(
|
||||
&bench_id,
|
||||
param,
|
||||
param.name(),
|
||||
display_name,
|
||||
&OperatorType::Atomic,
|
||||
bit_size as u32,
|
||||
vec![param.message_modulus().0.ilog2(); num_blocks],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
bench_group.finish()
|
||||
}
|
||||
|
||||
macro_rules! define_cuda_server_key_bench_signed_cast_fn (
|
||||
(method_name: $server_key_method:ident, display_name:$name:ident) => {
|
||||
::paste::paste!{
|
||||
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
|
||||
cuda_bench_server_key_signed_cast_function(
|
||||
c,
|
||||
concat!("integer::cuda::signed::", stringify!($server_key_method)),
|
||||
stringify!($name),
|
||||
|server_key, lhs, rhs, stream| {
|
||||
server_key.$server_key_method(lhs, rhs, stream);
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_signed_cast_fn!(
|
||||
method_name: cast_to_unsigned,
|
||||
display_name: cast_to_unsigned
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_signed_cast_fn!(
|
||||
method_name: cast_to_signed,
|
||||
display_name: cast_to_signed
|
||||
);
|
||||
|
||||
criterion_group!(cuda_cast_ops, cuda_cast_to_unsigned, cuda_cast_to_signed);
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use cuda::{
|
||||
default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops,
|
||||
cuda_cast_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops,
|
||||
unchecked_scalar_cuda_ops,
|
||||
};
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
@@ -2160,6 +2255,7 @@ fn go_through_gpu_bench_groups(val: &str) {
|
||||
"default" => {
|
||||
default_cuda_ops();
|
||||
default_scalar_cuda_ops();
|
||||
cuda_cast_ops();
|
||||
}
|
||||
"unchecked" => {
|
||||
unchecked_cuda_ops();
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// If this test break the c_api doc needs to be updated
|
||||
|
||||
#include "tfhe.h"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <tfhe.h>
|
||||
|
||||
int main(void) {
|
||||
int ok = 0;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#if defined(WITH_FEATURE_GPU)
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
* - NUM_CPU: number of CPU threads (tfhe internally automatically creates them)
|
||||
*/
|
||||
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#include "tfhe.h"
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <tfhe.h>
|
||||
|
||||
int main(void) {
|
||||
// We want to use zk-proof, which requires bounded random distributions
|
||||
|
||||
@@ -4,7 +4,7 @@ The first step is the creation of the configuration. The configuration is used t
|
||||
|
||||
A configuration can be created by using the ConfigBuilder type.
|
||||
|
||||
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](broken-reference).
|
||||
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](../guides/rust\_configuration.md#homomorphic-types).
|
||||
|
||||
The config is generated by first creating a builder with all types deactivated. Then, the integer types with default parameters are activated, since we are going to use FheUint8 values.
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
The basic steps for using the high-level API of TFHE-rs are:
|
||||
|
||||
1. [Importing the TFHE-rs prelude](quick\_start.md#imports);
|
||||
2. Client-side: [Configuring and creating keys](../fundamentals/configure-and-create-keys.md);
|
||||
2. Client-side: [Configuring and creating keys](../fundamentals/configure-and-generate-keys.md);
|
||||
3. Client-side: [Encrypting data](../fundamentals/encrypt-data.md);
|
||||
4. Server-side: [Setting the server key](../fundamentals/set-the-server-key.md);
|
||||
5. Server-side: [Computing over encrypted data](../fundamentals/compute.md);
|
||||
|
||||
@@ -79,7 +79,7 @@ $
|
||||
|
||||
```c
|
||||
|
||||
#include <tfhe.h>
|
||||
#include "tfhe.h"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
@@ -35,8 +35,7 @@ function fhe_uint32_example() {
|
||||
init_panic_hook();
|
||||
|
||||
const block_params = new ShortintParameters(ShortintParametersName.PARAM_SMALL_MESSAGE_2_CARRY_2_COMPACT_PK);
|
||||
let config = TfheConfigBuilder.all_disabled()
|
||||
.enable_default_integers()
|
||||
let config = TfheConfigBuilder.default()
|
||||
.build();
|
||||
|
||||
let clientKey = TfheClientKey.generate(config);
|
||||
|
||||
@@ -201,7 +201,7 @@ All operations follow the same syntax than the one described in [here](../gettin
|
||||
# Benchmarks
|
||||
|
||||
All GPU benchmarks presented here were obtained on a single H100 GPU, and rely on the multithreaded PBS algorithm.
|
||||
The cryptographic parameters PARAM\_GPU\_MULTI\_BIT\_MESSAGE\_1\_CARRY\_2\_GROUP\_3\_KS\_PBS were used.
|
||||
The cryptographic parameters PARAM\_GPU\_MULTI\_BIT\_MESSAGE\_2\_CARRY\_2\_GROUP\_3\_KS\_PBS were used.
|
||||
Performance is the following when the inputs of the benchmarked operation are encrypted:
|
||||
|
||||
| Operation \ Size | `FheUint7` | `FheUint16` | `FheUint32` | `FheUint64` | `FheUint128` | `FheUint256` |
|
||||
|
||||
@@ -170,10 +170,10 @@ pub unsafe extern "C" fn core_crypto_lwe_encrypt(
|
||||
let seed = (seed_high_bytes << 64) | seed_low_bytes;
|
||||
|
||||
let seed = Seed(seed);
|
||||
let mut determinisitic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
|
||||
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
|
||||
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
|
||||
determinisitic_seeder.seed(),
|
||||
&mut determinisitic_seeder,
|
||||
deterministic_seeder.seed(),
|
||||
&mut deterministic_seeder,
|
||||
);
|
||||
|
||||
let plaintext = Plaintext(pt);
|
||||
@@ -219,10 +219,10 @@ pub unsafe extern "C" fn core_crypto_ggsw_encrypt(
|
||||
let seed = (seed_high_bytes << 64) | seed_low_bytes;
|
||||
|
||||
let seed = Seed(seed);
|
||||
let mut determinisitic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
|
||||
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
|
||||
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
|
||||
determinisitic_seeder.seed(),
|
||||
&mut determinisitic_seeder,
|
||||
deterministic_seeder.seed(),
|
||||
&mut deterministic_seeder,
|
||||
);
|
||||
|
||||
let plaintext = Plaintext(pt);
|
||||
|
||||
@@ -3246,7 +3246,12 @@ mod test {
|
||||
|
||||
let mut thread_rng = rand::thread_rng();
|
||||
|
||||
for _ in 0..10_000 {
|
||||
#[cfg(not(tarpaulin))]
|
||||
let nb_tests = 10_000;
|
||||
#[cfg(tarpaulin)]
|
||||
let nb_tests = 1;
|
||||
|
||||
for _ in 0..nb_tests {
|
||||
let lwe_sk =
|
||||
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
|
||||
|
||||
@@ -3305,7 +3310,12 @@ mod test {
|
||||
|
||||
let mut thread_rng = rand::thread_rng();
|
||||
|
||||
for _ in 0..100 {
|
||||
#[cfg(not(tarpaulin))]
|
||||
let nb_tests = 100;
|
||||
#[cfg(tarpaulin)]
|
||||
let nb_tests = 1;
|
||||
|
||||
for _ in 0..nb_tests {
|
||||
// We'll encrypt between 1 and 4 * lwe_dimension ciphertexts
|
||||
let ct_count: usize = thread_rng.gen();
|
||||
let ct_count = ct_count % (lwe_dimension.0 * 4) + 1;
|
||||
|
||||
@@ -206,7 +206,10 @@ where
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[cfg(not(tarpaulin))]
|
||||
const NB_TESTS: usize = 1_000_000_000;
|
||||
#[cfg(tarpaulin)]
|
||||
const NB_TESTS: usize = 1;
|
||||
|
||||
#[test]
|
||||
fn test_divide_funcs() {
|
||||
@@ -295,7 +298,7 @@ mod test {
|
||||
use rand::Rng;
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
for _ in 0..1_000_000_000 {
|
||||
for _ in 0..NB_TESTS {
|
||||
let value: u64 = rng.gen();
|
||||
// This is an easy case where we expect the values to match exactly, to cover other
|
||||
// cases we would be re coding the algorithms here.
|
||||
|
||||
@@ -993,7 +993,10 @@ fn lwe_compact_public_encrypt_decrypt_custom_mod<Scalar: UnsignedTorus>(
|
||||
}
|
||||
|
||||
create_parametrized_test!(lwe_compact_public_encrypt_decrypt_custom_mod {
|
||||
TEST_PARAMS_4_BITS_NATIVE_U64
|
||||
#[cfg(not(tarpaulin))]
|
||||
TEST_PARAMS_4_BITS_NATIVE_U64,
|
||||
#[cfg(tarpaulin)]
|
||||
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64
|
||||
});
|
||||
|
||||
#[cfg(feature = "zk-pok-experimental")]
|
||||
@@ -1095,7 +1098,10 @@ fn lwe_compact_public_encrypt_prove_verify_decrypt_custom_mod<Scalar>(
|
||||
|
||||
#[cfg(feature = "zk-pok-experimental")]
|
||||
create_parametrized_test!(lwe_compact_public_encrypt_prove_verify_decrypt_custom_mod {
|
||||
TEST_PARAMS_4_BITS_NATIVE_U64
|
||||
#[cfg(not(tarpaulin))]
|
||||
TEST_PARAMS_4_BITS_NATIVE_U64,
|
||||
#[cfg(tarpaulin)]
|
||||
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64
|
||||
});
|
||||
|
||||
#[cfg(feature = "zk-pok-experimental")]
|
||||
@@ -1316,3 +1322,219 @@ fn test_par_compact_lwe_list_public_key_encryption_and_proof() {
|
||||
assert_eq!(ser_lwe_ct_list, par_lwe_ct_list);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compact_public_key_encryption() {
|
||||
use rand::Rng;
|
||||
|
||||
#[cfg(not(tarpaulin))]
|
||||
let nb_tests = 10_000;
|
||||
#[cfg(tarpaulin)]
|
||||
let nb_tests = 100;
|
||||
|
||||
#[cfg(not(tarpaulin))]
|
||||
let lwe_dimension = LweDimension(2048);
|
||||
#[cfg(tarpaulin)]
|
||||
let lwe_dimension = LweDimension(1);
|
||||
|
||||
let glwe_noise_distribution =
|
||||
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
|
||||
let ciphertext_modulus = CiphertextModulus::new_native();
|
||||
|
||||
let mut secret_random_generator = test_tools::new_secret_random_generator();
|
||||
let mut encryption_random_generator = test_tools::new_encryption_random_generator();
|
||||
|
||||
let mut thread_rng = rand::thread_rng();
|
||||
|
||||
for _ in 0..nb_tests {
|
||||
let lwe_sk = LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
|
||||
|
||||
let mut compact_lwe_pk = LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
|
||||
|
||||
generate_lwe_compact_public_key(
|
||||
&lwe_sk,
|
||||
&mut compact_lwe_pk,
|
||||
glwe_noise_distribution,
|
||||
&mut encryption_random_generator,
|
||||
);
|
||||
|
||||
let msg: u64 = thread_rng.gen();
|
||||
let msg = msg % 16;
|
||||
|
||||
let plaintext = Plaintext(msg << 60);
|
||||
|
||||
let mut output_ct = LweCiphertext::new(
|
||||
0u64,
|
||||
lwe_dimension.to_lwe_size(),
|
||||
CiphertextModulus::new_native(),
|
||||
);
|
||||
|
||||
encrypt_lwe_ciphertext_with_compact_public_key(
|
||||
&compact_lwe_pk,
|
||||
&mut output_ct,
|
||||
plaintext,
|
||||
glwe_noise_distribution,
|
||||
glwe_noise_distribution,
|
||||
&mut secret_random_generator,
|
||||
&mut encryption_random_generator,
|
||||
);
|
||||
|
||||
let decrypted_plaintext = decrypt_lwe_ciphertext(&lwe_sk, &output_ct);
|
||||
|
||||
let signed_decomposer =
|
||||
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
|
||||
|
||||
let cleartext = signed_decomposer.closest_representable(decrypted_plaintext.0) >> 60;
|
||||
|
||||
assert_eq!(cleartext, msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_par_compact_lwe_list_public_key_encryption_equivalence() {
|
||||
use rand::Rng;
|
||||
|
||||
let lwe_dimension = LweDimension(2048);
|
||||
let glwe_noise_distribution =
|
||||
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
|
||||
let ciphertext_modulus = CiphertextModulus::new_native();
|
||||
|
||||
let mut thread_rng = rand::thread_rng();
|
||||
|
||||
for _ in 0..NB_TESTS {
|
||||
// We'll encrypt between 1 and 4 * lwe_dimension ciphertexts
|
||||
let ct_count: usize = thread_rng.gen();
|
||||
let ct_count = ct_count % (lwe_dimension.0 * 4) + 1;
|
||||
let lwe_ciphertext_count = LweCiphertextCount(ct_count);
|
||||
|
||||
let seed = test_tools::random_seed();
|
||||
let mut input_plaintext_list =
|
||||
PlaintextList::new(0u64, PlaintextCount(lwe_ciphertext_count.0));
|
||||
input_plaintext_list.iter_mut().for_each(|x| {
|
||||
let msg: u64 = thread_rng.gen();
|
||||
*x.0 = (msg % 16) << 60;
|
||||
});
|
||||
|
||||
let par_lwe_ct_list = {
|
||||
let mut deterministic_seeder =
|
||||
DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
|
||||
let mut secret_random_generator =
|
||||
SecretRandomGenerator::<ActivatedRandomGenerator>::new(deterministic_seeder.seed());
|
||||
let mut encryption_random_generator =
|
||||
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
|
||||
deterministic_seeder.seed(),
|
||||
&mut deterministic_seeder,
|
||||
);
|
||||
|
||||
let lwe_sk =
|
||||
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
|
||||
|
||||
let mut compact_lwe_pk =
|
||||
LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
|
||||
|
||||
generate_lwe_compact_public_key(
|
||||
&lwe_sk,
|
||||
&mut compact_lwe_pk,
|
||||
glwe_noise_distribution,
|
||||
&mut encryption_random_generator,
|
||||
);
|
||||
|
||||
let mut output_compact_ct_list = LweCompactCiphertextList::new(
|
||||
0u64,
|
||||
lwe_dimension.to_lwe_size(),
|
||||
lwe_ciphertext_count,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
|
||||
par_encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
|
||||
&compact_lwe_pk,
|
||||
&mut output_compact_ct_list,
|
||||
&input_plaintext_list,
|
||||
glwe_noise_distribution,
|
||||
glwe_noise_distribution,
|
||||
&mut secret_random_generator,
|
||||
&mut encryption_random_generator,
|
||||
);
|
||||
|
||||
let mut output_plaintext_list = input_plaintext_list.clone();
|
||||
output_plaintext_list.as_mut().fill(0u64);
|
||||
|
||||
let lwe_ciphertext_list = output_compact_ct_list.par_expand_into_lwe_ciphertext_list();
|
||||
|
||||
decrypt_lwe_ciphertext_list(&lwe_sk, &lwe_ciphertext_list, &mut output_plaintext_list);
|
||||
|
||||
let signed_decomposer =
|
||||
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
|
||||
|
||||
output_plaintext_list
|
||||
.iter_mut()
|
||||
.for_each(|x| *x.0 = signed_decomposer.closest_representable(*x.0));
|
||||
|
||||
assert_eq!(input_plaintext_list, output_plaintext_list);
|
||||
|
||||
lwe_ciphertext_list
|
||||
};
|
||||
|
||||
let ser_lwe_ct_list = {
|
||||
let mut deterministic_seeder =
|
||||
DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
|
||||
let mut secret_random_generator =
|
||||
SecretRandomGenerator::<ActivatedRandomGenerator>::new(deterministic_seeder.seed());
|
||||
let mut encryption_random_generator =
|
||||
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
|
||||
deterministic_seeder.seed(),
|
||||
&mut deterministic_seeder,
|
||||
);
|
||||
|
||||
let lwe_sk =
|
||||
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
|
||||
|
||||
let mut compact_lwe_pk =
|
||||
LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
|
||||
|
||||
generate_lwe_compact_public_key(
|
||||
&lwe_sk,
|
||||
&mut compact_lwe_pk,
|
||||
glwe_noise_distribution,
|
||||
&mut encryption_random_generator,
|
||||
);
|
||||
|
||||
let mut output_compact_ct_list = LweCompactCiphertextList::new(
|
||||
0u64,
|
||||
lwe_dimension.to_lwe_size(),
|
||||
lwe_ciphertext_count,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
|
||||
encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
|
||||
&compact_lwe_pk,
|
||||
&mut output_compact_ct_list,
|
||||
&input_plaintext_list,
|
||||
glwe_noise_distribution,
|
||||
glwe_noise_distribution,
|
||||
&mut secret_random_generator,
|
||||
&mut encryption_random_generator,
|
||||
);
|
||||
|
||||
let mut output_plaintext_list = input_plaintext_list.clone();
|
||||
output_plaintext_list.as_mut().fill(0u64);
|
||||
|
||||
let lwe_ciphertext_list = output_compact_ct_list.expand_into_lwe_ciphertext_list();
|
||||
|
||||
decrypt_lwe_ciphertext_list(&lwe_sk, &lwe_ciphertext_list, &mut output_plaintext_list);
|
||||
|
||||
let signed_decomposer =
|
||||
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
|
||||
|
||||
output_plaintext_list
|
||||
.iter_mut()
|
||||
.for_each(|x| *x.0 = signed_decomposer.closest_representable(*x.0));
|
||||
|
||||
assert_eq!(input_plaintext_list, output_plaintext_list);
|
||||
|
||||
lwe_ciphertext_list
|
||||
};
|
||||
|
||||
assert_eq!(ser_lwe_ct_list, par_lwe_ct_list);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,7 +212,7 @@ fn test_lwe_encrypt_ks_switch_mod_decrypt_custom_mod() {
|
||||
|
||||
// In coverage, we break after one while loop iteration, changing message values does not
|
||||
// yield higher coverage
|
||||
#[cfg(feature = "__coverage")]
|
||||
#[cfg(tarpaulin)]
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,10 +12,18 @@ fn test_seeded_lwe_ksk_gen_equivalence<Scalar: UnsignedTorus + Send + Sync>(
|
||||
// DISCLAIMER: these toy example parameters are not guaranteed to be secure or yield correct
|
||||
// computations
|
||||
// Define parameters for LweKeyswitchKey creation
|
||||
#[cfg(not(tarpaulin))]
|
||||
let input_lwe_dimension = LweDimension(742);
|
||||
#[cfg(tarpaulin)]
|
||||
let input_lwe_dimension = LweDimension(1);
|
||||
|
||||
#[cfg(not(tarpaulin))]
|
||||
let output_lwe_dimension = LweDimension(2048);
|
||||
#[cfg(tarpaulin)]
|
||||
let output_lwe_dimension = LweDimension(32);
|
||||
|
||||
let lwe_noise_distribution =
|
||||
DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.000007069849454709433));
|
||||
let output_lwe_dimension = LweDimension(2048);
|
||||
let decomp_base_log = DecompositionBaseLog(3);
|
||||
let decomp_level_count = DecompositionLevelCount(5);
|
||||
|
||||
|
||||
@@ -535,11 +535,13 @@ fn std_lwe_encrypt_multi_bit_deterministic_pbs_decrypt_custom_mod<Scalar>(
|
||||
#[test]
|
||||
pub fn test_lwe_encrypt_multi_bit_pbs_decrypt_factor_2_thread_5_native_mod() {
|
||||
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_2_2_2_PARAMS);
|
||||
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_3_3_2_PARAMS);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_lwe_encrypt_multi_bit_pbs_decrypt_factor_3_thread_12_native_mod() {
|
||||
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_2_2_3_PARAMS);
|
||||
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_3_3_3_PARAMS);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -12,9 +12,17 @@ fn test_seeded_lwe_pksk_gen_equivalence<Scalar: UnsignedTorus>(
|
||||
// DISCLAIMER: these toy example parameters are not guaranteed to be secure or yield correct
|
||||
// computations
|
||||
// Define parameters for LweKeyswitchKey creation
|
||||
#[cfg(not(tarpaulin))]
|
||||
let input_lwe_dimension = LweDimension(742);
|
||||
let output_glwe_dimension = GlweDimension(1);
|
||||
#[cfg(tarpaulin)]
|
||||
let input_lwe_dimension = LweDimension(1);
|
||||
|
||||
#[cfg(not(tarpaulin))]
|
||||
let output_polynomial_size = PolynomialSize(2048);
|
||||
#[cfg(tarpaulin)]
|
||||
let output_polynomial_size = PolynomialSize(32);
|
||||
|
||||
let output_glwe_dimension = GlweDimension(1);
|
||||
let glwe_noise_distribution = DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
));
|
||||
|
||||
@@ -398,6 +398,32 @@ pub const TEST_PARAMS_4_BITS_NATIVE_U128: ClassicTestParams<u128> = ClassicTestP
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U128: ClassicTestParams<u128> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(64),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
4.9982771e-11,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
8.6457178e-32,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ks_level: DecompositionLevelCount(5),
|
||||
ks_base_log: DecompositionBaseLog(3),
|
||||
pfks_level: DecompositionLevelCount(1),
|
||||
pfks_base_log: DecompositionBaseLog(23),
|
||||
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
)),
|
||||
cbs_level: DecompositionLevelCount(0),
|
||||
cbs_base_log: DecompositionBaseLog(0),
|
||||
message_modulus_log: MessageModulusLog(4),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
pub const TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(742),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
@@ -423,6 +449,32 @@ pub const TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestPara
|
||||
ciphertext_modulus: CiphertextModulus::new(1 << 127),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(64),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
4.9982771e-11,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
8.6457178e-32,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ks_level: DecompositionLevelCount(5),
|
||||
ks_base_log: DecompositionBaseLog(3),
|
||||
pfks_level: DecompositionLevelCount(1),
|
||||
pfks_base_log: DecompositionBaseLog(23),
|
||||
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
)),
|
||||
cbs_level: DecompositionLevelCount(0),
|
||||
cbs_base_log: DecompositionBaseLog(0),
|
||||
message_modulus_log: MessageModulusLog(3),
|
||||
ciphertext_modulus: CiphertextModulus::new(1 << 127),
|
||||
};
|
||||
|
||||
fn lwe_encrypt_pbs_f128_decrypt_custom_mod<Scalar>(params: ClassicTestParams<Scalar>)
|
||||
where
|
||||
Scalar: UnsignedTorus
|
||||
@@ -537,9 +589,15 @@ where
|
||||
|
||||
#[test]
|
||||
fn lwe_encrypt_pbs_f128_decrypt_custom_mod_test_params_4_bits_native_u128() {
|
||||
#[cfg(not(tarpaulin))]
|
||||
lwe_encrypt_pbs_f128_decrypt_custom_mod(TEST_PARAMS_4_BITS_NATIVE_U128);
|
||||
#[cfg(tarpaulin)]
|
||||
lwe_encrypt_pbs_f128_decrypt_custom_mod(COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U128);
|
||||
}
|
||||
#[test]
|
||||
fn lwe_encrypt_pbs_f128_decrypt_custom_mod_test_params_3_bits_127_u128() {
|
||||
#[cfg(not(tarpaulin))]
|
||||
lwe_encrypt_pbs_f128_decrypt_custom_mod(TEST_PARAMS_3_BITS_127_U128);
|
||||
#[cfg(tarpaulin)]
|
||||
lwe_encrypt_pbs_f128_decrypt_custom_mod(COVERAGE_TEST_PARAMS_3_BITS_127_U128);
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ mod lwe_packing_keyswitch_key_generation;
|
||||
mod lwe_private_functional_packing_keyswitch;
|
||||
pub(crate) mod lwe_programmable_bootstrapping;
|
||||
mod modulus_switch_compression;
|
||||
#[cfg(not(tarpaulin))]
|
||||
mod noise_distribution;
|
||||
|
||||
pub struct TestResources {
|
||||
@@ -78,6 +79,32 @@ pub const TEST_PARAMS_4_BITS_NATIVE_U64: ClassicTestParams<u64> = ClassicTestPar
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64: ClassicTestParams<u64> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(32),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.000007069849454709433,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ks_level: DecompositionLevelCount(5),
|
||||
ks_base_log: DecompositionBaseLog(3),
|
||||
pfks_level: DecompositionLevelCount(1),
|
||||
pfks_base_log: DecompositionBaseLog(23),
|
||||
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
)),
|
||||
cbs_level: DecompositionLevelCount(0),
|
||||
cbs_base_log: DecompositionBaseLog(0),
|
||||
message_modulus_log: MessageModulusLog(4),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
pub const TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(742),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
@@ -103,6 +130,32 @@ pub const TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams
|
||||
ciphertext_modulus: CiphertextModulus::new(1 << 63),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(32),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.000007069849454709433,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ks_level: DecompositionLevelCount(5),
|
||||
ks_base_log: DecompositionBaseLog(3),
|
||||
pfks_level: DecompositionLevelCount(1),
|
||||
pfks_base_log: DecompositionBaseLog(23),
|
||||
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000029403601535432533,
|
||||
)),
|
||||
cbs_level: DecompositionLevelCount(0),
|
||||
cbs_base_log: DecompositionBaseLog(0),
|
||||
message_modulus_log: MessageModulusLog(3),
|
||||
ciphertext_modulus: CiphertextModulus::new(1 << 63),
|
||||
};
|
||||
|
||||
pub const TEST_PARAMS_3_BITS_SOLINAS_U64: ClassicTestParams<u64> = ClassicTestParams {
|
||||
lwe_dimension: LweDimension(742),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
@@ -196,6 +249,24 @@ pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
|
||||
thread_count: ThreadCount(5),
|
||||
};
|
||||
|
||||
pub const MULTI_BIT_3_3_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
|
||||
input_lwe_dimension: LweDimension(922),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.0000003272369292345697,
|
||||
)),
|
||||
decomp_base_log: DecompositionBaseLog(14),
|
||||
decomp_level_count: DecompositionLevelCount(2),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(8192),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.0000000000000000002168404344971009,
|
||||
)),
|
||||
message_modulus_log: MessageModulusLog(6),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
grouping_factor: LweBskGroupingFactor(2),
|
||||
thread_count: ThreadCount(5),
|
||||
};
|
||||
|
||||
pub const MULTI_BIT_2_2_2_CUSTOM_MOD_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
|
||||
input_lwe_dimension: LweDimension(818),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
@@ -232,6 +303,24 @@ pub const MULTI_BIT_2_2_3_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
|
||||
thread_count: ThreadCount(12),
|
||||
};
|
||||
|
||||
pub const MULTI_BIT_3_3_3_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
|
||||
input_lwe_dimension: LweDimension(972),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000013016688349592805,
|
||||
)),
|
||||
decomp_base_log: DecompositionBaseLog(14),
|
||||
decomp_level_count: DecompositionLevelCount(2),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(8192),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.0000000000000000002168404344971009,
|
||||
)),
|
||||
message_modulus_log: MessageModulusLog(6),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
grouping_factor: LweBskGroupingFactor(3),
|
||||
thread_count: ThreadCount(5),
|
||||
};
|
||||
|
||||
pub const MULTI_BIT_2_2_3_CUSTOM_MOD_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
|
||||
input_lwe_dimension: LweDimension(888),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
@@ -311,6 +400,70 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
|
||||
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
|
||||
};
|
||||
|
||||
// DISCLAIMER: example parameters tailored for coverage tests. There are not guaranteed
|
||||
// to be secure or yield correct computations.
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_FFT_U32_PARAMS: FftTestParams<u32> = FftTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(64),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000004998277131225527,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000000000000000000008645717832544903,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_FFT_U64_PARAMS: FftTestParams<u64> = FftTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(64),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000004998277131225527,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000000000000000000008645717832544903,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_FFT_U128_PARAMS: FftTestParams<u128> = FftTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(64),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000004998277131225527,
|
||||
)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000000000000000000008645717832544903,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
};
|
||||
|
||||
#[cfg(tarpaulin)]
|
||||
pub const COVERAGE_FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
|
||||
lwe_dimension: LweDimension(1),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
polynomial_size: PolynomialSize(64),
|
||||
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.12345)),
|
||||
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
|
||||
0.00000000000000000000000000000008645717832544903,
|
||||
)),
|
||||
pbs_base_log: DecompositionBaseLog(23),
|
||||
pbs_level: DecompositionLevelCount(1),
|
||||
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
|
||||
};
|
||||
|
||||
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
|
||||
lwe_dimension: LweDimension(481),
|
||||
glwe_dimension: GlweDimension(1),
|
||||
@@ -480,21 +633,33 @@ pub(crate) fn gen_keys_or_get_from_cache_if_enabled<
|
||||
|
||||
// Macro to generate tests for all parameter sets
|
||||
macro_rules! create_parametrized_test{
|
||||
($name:ident { $($param:ident),* $(,)? }) => {
|
||||
(
|
||||
$name:ident {
|
||||
$($(#[$cfg:meta])* $param:ident),*
|
||||
$(,)?
|
||||
}
|
||||
) => {
|
||||
::paste::paste! {
|
||||
$(
|
||||
#[test]
|
||||
fn [<test_ $name _ $param:lower>]() {
|
||||
$name($param)
|
||||
}
|
||||
#[test]
|
||||
$(#[$cfg])*
|
||||
fn [<test_ $name _ $param:lower>]() {
|
||||
$name($param)
|
||||
}
|
||||
)*
|
||||
}
|
||||
};
|
||||
($name:ident)=> {
|
||||
create_parametrized_test!($name
|
||||
{
|
||||
#[cfg(not(tarpaulin))]
|
||||
TEST_PARAMS_4_BITS_NATIVE_U64,
|
||||
TEST_PARAMS_3_BITS_63_U64
|
||||
#[cfg(not(tarpaulin))]
|
||||
TEST_PARAMS_3_BITS_63_U64,
|
||||
#[cfg(tarpaulin)]
|
||||
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64,
|
||||
#[cfg(tarpaulin)]
|
||||
COVERAGE_TEST_PARAMS_3_BITS_63_U64
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ pub struct ClassicBootstrapKeys<Scalar: UnsignedInteger> {
|
||||
pub small_lwe_sk: LweSecretKey<Vec<Scalar>>,
|
||||
pub big_lwe_sk: LweSecretKey<Vec<Scalar>>,
|
||||
pub bsk: LweBootstrapKeyOwned<Scalar>,
|
||||
|
||||
pub fbsk: FourierLweBootstrapKeyOwned,
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
pub(crate) mod mask_random_generator;
|
||||
pub(crate) mod noise_random_generator;
|
||||
#[cfg(not(tarpaulin))]
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user