Compare commits

...

54 Commits

Author SHA1 Message Date
David Testé
146aaf662a chore(ci): spawn ec2 instance on-demand for code-coverage 2024-05-02 15:41:33 +02:00
David Testé
c28c8beb69 chore(ci): speed-up core_crypto code coverage execution duration 2024-05-02 15:40:09 +02:00
Nicolas Sarlin
08fd31f8fc feat(gpu): add compatibility with cuda from package managers 2024-05-02 10:14:22 +02:00
Nicolas Sarlin
ba18519946 chore(ci): update toolchain to latest nightly 2024-04-30 16:50:18 +02:00
Beka Barbakadze
ab8a51bc0d feat(gpu): implement div_rem for cuda GPU 2024-04-30 15:53:02 +02:00
Arthur Meyre
9ecb9f416e chore(doc): fix broken reference 2024-04-30 11:25:06 +02:00
David Testé
0618b9422c chore(ci): separate pcc and tests for gpu to allow faster merge 2024-04-30 08:51:46 +02:00
David Testé
5bb3af30b6 chore(bench): fix naming format for signed scalar operations 2024-04-30 08:51:29 +02:00
aquint-zama
ef1607db51 chore(doc): fix broken link 2024-04-29 14:35:18 +02:00
Arthur Meyre
d2daf449b7 chore(core): fix the stair KS test for more generic stair KS parameters
- the actual large LWE dimension to consider is the partial fill, as those
are the non-zero coefficients on which the shared coefficients are then
computed
- the 4-bit parameters don't exhibit the bug, as they are not on the GLWE
noise plateau
2024-04-29 13:23:46 +02:00
dependabot[bot]
826c80ea6d chore(deps): bump actions/upload-artifact from 4.3.2 to 4.3.3
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.2 to 4.3.3.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](1746f4ab65...65462800fd)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-29 13:23:12 +02:00
dependabot[bot]
bc991e1946 chore(deps): bump actions/checkout from 4.1.3 to 4.1.4
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.3 to 4.1.4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](1d96c772d1...0ad4b8fada)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-29 13:23:06 +02:00
Pedro Alves
f88e309503 feat(gpu): Implements a classical PBS variant that uses thread block clusters and distributed shared memory 2024-04-29 09:42:58 +02:00
Pedro Alves
cbc23c08a2 feat(gpu): Implements a multi-bit PBS variant that uses thread block
clusters and distributed shared memory
2024-04-29 09:42:58 +02:00
Pedro Alves
e9ee813911 fix(gpu): fix out-of-memory error in the custom benchmark tool 2024-04-29 09:42:58 +02:00
Pedro Alves
3ebc114c85 fix(gpu): implements a NOSM mode for tree_add_chunks() 2024-04-29 09:42:58 +02:00
Pedro Alves
effe6a9ec8 chore(gpu): remove message_3_carry_3 parameters from scalar/encrypted comparison tests 2024-04-29 09:42:58 +02:00
Pedro Alves
a8fe4d9f38 fix(gpu): fix multi-bit PBS when pbs_level > 1 2024-04-29 09:42:58 +02:00
Pedro Alves
af7c1e14a4 fix(gpu): refactor PBS' host functions to run using the same variant as the scratch function 2024-04-29 09:42:58 +02:00
Agnes Leroy
85fbca62a2 chore(gpu): extend integer multiplication to N other than 2048 2024-04-29 09:42:58 +02:00
Agnes Leroy
9c36ee477b chore(gpu): fix params in GPU integer benchmarks 2024-04-26 13:00:01 -03:00
Pedro Alves
9d70e695b5 chore(gpu): automatically compute the best lwe_chunk_size 2024-04-26 11:58:19 -03:00
Pedro Alves
33e7cfea2d chore(gpu): move luts_message_carry allocation to host_integer_sum_ciphertexts_vec_kb to reduce memory consumption 2024-04-26 09:20:39 -03:00
Daniel Demmler
ce0aea171b fix(zk): correct byte indices for uncompressed serialization 2024-04-25 18:06:54 +02:00
Kelong Cong
083c068144 fix(zk): expose compute_crs_params since we need big_d and b_r too
Knowing these two values as well allows us to use
PublicParams::from_vec.
2024-04-25 16:45:11 +02:00
Kelong Cong
8912e887e3 feat(zk): open up the API to output the CRS witness dimension
The motivation for this PR is that the CRS is sometimes generated by a ceremony
instead of centrally, so it is useful to have a function that computes the
CRS witness dimension without generating the CRS centrally.
2024-04-25 16:45:11 +02:00
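
(Taken together with the previous commit, this lets a CRS produced by an external ceremony be handled entirely locally: compute_crs_params yields big_d, b_r, and the witness dimension up front, and the ceremony output can then be loaded through PublicParams::from_vec instead of generating the CRS in-process. This is a reading of the two commit messages above; the exact function signatures are not shown in this diff.)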
tmontaigu
ddd0f2ce79 chore(integer): reduce nb tests for big params 2024-04-25 13:53:58 +02:00
tmontaigu
aeab9e851b fix(integer): fix match_value 2024-04-25 13:53:58 +02:00
Mayeul@Zama
919f07f911 fix(shortint): is_functional_bivariate_pbs_possible takes an optional lut 2024-04-24 18:55:37 +02:00
tmontaigu
6199e04ce0 feat(integer): add vector match_value/index_of/contains/is_in 2024-04-23 10:55:36 +02:00
dependabot[bot]
77a6673d59 chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 7164405e8653277d57afd42ba081b5aa02a70396 to bb45937a053e097f8591208d8e74c90db1873d07.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](7164405e86...bb45937a05)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 13:01:57 +02:00
dependabot[bot]
631dd09445 chore(deps): bump tj-actions/changed-files from 44.0.1 to 44.3.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 44.0.1 to 44.3.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](635f118699...0874344d6e)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 09:20:46 +02:00
dependabot[bot]
a7f6ea12ce chore(deps): bump actions/upload-artifact from 4.3.1 to 4.3.2
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.1 to 4.3.2.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](5d5d22a312...1746f4ab65)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 09:20:21 +02:00
dependabot[bot]
0a1cba43c4 chore(deps): bump actions/checkout from 4.1.2 to 4.1.3
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.2 to 4.1.3.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](9bb56186c3...1d96c772d1)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 09:20:12 +02:00
Agnes Leroy
666926e4b0 chore(gpu): add tests for N=8192 multi-bit PBS 2024-04-19 17:02:55 +02:00
Agnes Leroy
105e2488f6 chore(gpu): add casts and signed scalar comparisons to the hl api 2024-04-18 18:20:24 +02:00
Agnes Leroy
cf62937632 feat(gpu): cast signed/unsigned and signed/signed 2024-04-18 09:11:53 +02:00
Arthur Meyre
c0985707ad chore(doc): fix API calls 2024-04-15 18:56:00 +02:00
Arthur Meyre
2cc20a6bd5 chore(ci): fix sage script f-string missing f 2024-04-15 18:55:06 +02:00
Agnes Leroy
28db75b3a3 fix(gpu): fix typo in doc 2024-04-15 17:06:49 +02:00
dependabot[bot]
6f31bffef6 chore(deps): bump codecov/codecov-action from 4.2.0 to 4.3.0
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 4.2.0 to 4.3.0.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](7afa10ed9b...84508663e9)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 13:15:10 +02:00
dependabot[bot]
9e6b251e68 chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from dc6353516c68da0f06325f42ad880f76a5e77ec9 to 7164405e8653277d57afd42ba081b5aa02a70396.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](dc6353516c...7164405e86)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 13:14:42 +02:00
dependabot[bot]
da46d26f1b chore(deps): bump tj-actions/changed-files from 44.0.0 to 44.0.1
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 44.0.0 to 44.0.1.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](2d756ea4c5...635f118699)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 12:24:24 +02:00
Mayeul@Zama
b4662607bf style(core): remove newline 2024-04-15 09:29:05 +02:00
Mayeul@Zama
faac5e10e6 style(c_api): fix typo 2024-04-15 09:29:05 +02:00
Mayeul@Zama
0fa5231e5a style(core): remove allow(clippy::used_underscore_binding) 2024-04-15 09:29:05 +02:00
Arthur Meyre
9f36cf7120 chore(ci): make a github runner remove a label 2024-04-12 09:54:45 +02:00
Arthur Meyre
a654a47136 chore(c): change include <tfhe.h> to "tfhe.h"
- as tfhe is not a system library, C/C++ practice is to use the "" style
2024-04-12 09:54:37 +02:00
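
A minimal illustration of that convention, as a sketch rather than code from this repository (tfhe.h stands in for any project-provided header):

    #include <tfhe.h>  /* angle form: searched only in the implementation's
                          system/include paths */
    #include "tfhe.h"  /* quote form: typically the directory of the including
                          file is searched first, then the same paths as the
                          angle form */

Since tfhe.h ships with the project rather than the system, the quote form is the conventional choice; the angle form is reserved for system libraries.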
Arthur Meyre
29dfac320c chore(c_api): fix build on M1 mac 2024-04-11 09:23:40 +02:00
Arthur Meyre
af11862dc1 chore(ci): add zk feature for deterministic FFT C API 2024-04-11 09:23:40 +02:00
Arthur Meyre
1b5745c22f chore(ci): add missing install for clippy_task 2024-04-11 09:23:40 +02:00
Arthur Meyre
e6970fc00d chore(tfhe): update serialization version 2024-04-11 09:23:20 +02:00
Arthur Meyre
cc4b7dc33e refactor(shortint): add max_noise_level field to CompressedServerKey 2024-04-11 09:23:20 +02:00
Agnes Leroy
5417ba53be fix(gpu): add edge test for scalar comparisons 2024-04-10 06:43:17 -03:00
171 changed files with 10494 additions and 1619 deletions

View File

@@ -45,14 +45,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -29,10 +29,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -36,8 +36,8 @@ jobs:
job-secret: ${{ secrets.JOB_SECRET }}
profile: gpu-test
cuda-tests-linux:
name: CUDA tests
cuda-pcc:
name: CUDA post-commit checks
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
@@ -56,14 +56,14 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
@@ -94,6 +94,64 @@ jobs:
run: |
make pcc_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-tests-linux:
name: CUDA tests
needs: [ setup-ec2, cuda-pcc ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
@@ -121,7 +179,7 @@ jobs:
teardown-ec2:
name: Teardown EC2 instance (cuda-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-tests-linux ]
needs: [ setup-ec2, cuda-pcc, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
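
(Net effect of commit 0618b9422c above: the former cuda-tests-linux job becomes cuda-pcc and runs only the post-commit checks via make pcc_gpu, a new cuda-tests-linux job that depends on it runs the full suite via make test_gpu, and the EC2 teardown now waits on both jobs.)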

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -97,13 +97,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -23,7 +23,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Install and run newline linter checks
if: matrix.os == 'ubuntu-latest'

View File

@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Get actionlint
run: |

View File

@@ -6,70 +6,58 @@ env:
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
jobs:
setup-ec2:
name: Setup EC2 instance (code-coverage)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-small
code-coverage:
name: Code coverage
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
timeout-minutes: 11520 # 8 days
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
with:
files_yaml: |
tfhe:
@@ -78,29 +66,29 @@ jobs:
- concrete-csprng/src/**
- name: Generate Keys
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make GEN_KEY_CACHE_COVERAGE_ONLY=TRUE gen_key_cache
make gen_key_cache_core_crypto
- name: Run coverage for core_crypto
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_core_crypto_cov AVX512_SUPPORT=ON
- name: Run coverage for boolean
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_boolean_cov
- name: Run coverage for shortint
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
uses: codecov/codecov-action@84508663e988701840491b86de86b666e8a86bed
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
@@ -108,13 +96,13 @@ jobs:
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
- name: Run integer coverage
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
uses: codecov/codecov-action@84508663e988701840491b86de86b666e8a86bed
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
@@ -127,8 +115,29 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-ec2:
name: Teardown EC2 instance (code-coverage)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, code-coverage ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
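
(This is the on-demand EC2 change from commit 146aaf662a above: a setup-ec2 job now starts the runner through zama-ai/slab-github-runner, the coverage job runs on the runner name that job outputs, and a teardown-ec2 job always stops the instance afterwards, replacing the old scheme where Slab supplied instance details through workflow_dispatch inputs.)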

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -88,13 +88,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -61,7 +61,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -71,7 +71,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -118,13 +118,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -47,14 +47,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -52,12 +52,12 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -81,7 +81,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -120,7 +120,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -133,12 +133,12 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -163,7 +163,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
@@ -194,7 +194,7 @@ jobs:
name: Remove 4090 bench label
if: ${{ always() && github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ["self-hosted", "4090-desktop"]
runs-on: ubuntu-latest
steps:
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -74,7 +74,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -92,12 +92,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -121,7 +121,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -56,7 +56,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -66,7 +66,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -100,7 +100,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -122,13 +122,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -64,7 +64,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -82,7 +82,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -107,7 +107,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -132,7 +132,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -57,7 +57,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -67,7 +67,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -101,7 +101,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -123,13 +123,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -31,10 +31,10 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
@@ -86,6 +86,13 @@ jobs:
run: |
make test_boolean
# Because we do "illegal" things with the build system which Cargo does not seem to like much
# we need to clear the cache to make sure the C API is built properly and does not use a stale
# cached version
- name: Clear build cache
run: |
cargo clean
- name: Run C API tests
run: |
make test_c_api

View File

@@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0

View File

@@ -55,7 +55,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -64,7 +64,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -17,10 +17,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Checkout lattice-estimator
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -45,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -55,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -89,13 +89,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -53,7 +53,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -71,12 +71,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -115,7 +115,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -52,7 +52,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -70,12 +70,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -99,7 +99,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -58,13 +58,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
with:
files_yaml: |
common_benches:
@@ -111,7 +111,7 @@ jobs:
- .github/workflows/wasm_client_benchmark.yml
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -30,12 +30,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -13,11 +13,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: repo-archive
path: '.'

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -98,13 +98,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -271,7 +271,7 @@ clippy_js_wasm_api: install_rs_check_toolchain
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
clippy_tasks:
clippy_tasks: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tasks -- --no-deps -D warnings
@@ -281,19 +281,19 @@ clippy_trivium: install_rs_check_toolchain
-p tfhe-trivium -- --no-deps -D warnings
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets:
clippy_all_targets: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
clippy_concrete_csprng:
clippy_concrete_csprng: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE) \
-p concrete-csprng -- --no-deps -D warnings
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
clippy_zk_pok:
clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
@@ -376,7 +376,7 @@ build_c_api_gpu: install_rs_check_toolchain
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
@@ -424,13 +424,6 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
-p $(TFHE_SPEC) -- core_crypto::
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
fi
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
test_cuda_backend:
@@ -444,14 +437,14 @@ test_cuda_backend:
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
test_core_crypto_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
@@ -479,14 +472,14 @@ test_c_api_rs: install_rs_check_toolchain
.PHONY: test_c_api_c # Run the C tests for the C API
test_c_api_c: build_c_api
./scripts/c_api_tests.sh
./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"
.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c
.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu
./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
@@ -638,12 +631,12 @@ test_kreyvium: install_rs_build_toolchain
-p tfhe-trivium -- --test-threads=1 kreyvium::
.PHONY: test_concrete_csprng # Run concrete-csprng tests
test_concrete_csprng:
test_concrete_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
test_zk_pok:
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
@@ -683,14 +676,14 @@ check_md_docs_are_tested:
RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
.PHONY: check_compile_tests # Build tests in debug without running them
check_compile_tests:
check_compile_tests: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
-p $(TFHE_SPEC)
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
"$(MAKE)" build_c_api && \
./scripts/c_api_tests.sh --build-only; \
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
fi
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
@@ -813,8 +806,6 @@ bench_oprf: install_rs_check_toolchain
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
bench_shortint_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -823,7 +814,6 @@ bench_shortint_multi_bit: install_rs_check_toolchain
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_boolean # Run benchmarks for boolean
bench_boolean: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -867,6 +857,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
# Utility tools
#
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \

View File

@@ -71,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
}
fn main() {
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -143,7 +143,7 @@ use tfhe::prelude::*;
use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::all_disabled().enable_default_integers().build();
let config = ConfigBuilder::default().build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));

View File

@@ -13,6 +13,7 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
[build-dependencies]
cmake = { version = "0.1" }
pkg-config = { version = "0.3" }
[dependencies]
thiserror = "1.0"

View File

@@ -21,7 +21,15 @@ fn main() {
let dest = cmake::build("cuda");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
// Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
if pkg_config::Config::new()
.atleast_version("10")
.probe("cuda")
.is_err()
{
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
}
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");

View File

@@ -8,7 +8,6 @@
#include <cuda_runtime.h>
#define synchronize_threads_in_block() __syncthreads()
extern "C" {
#define check_cuda_error(ans) \
@@ -57,6 +56,8 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);

View File

@@ -39,6 +39,22 @@ enum COMPARISON_TYPE {
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
extern "C" {
void scratch_cuda_apply_univariate_lut_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, void *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void *ksk, void *bsk, uint32_t num_blocks);
void cleanup_cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
int8_t **mem_ptr_void);
void scratch_cuda_full_propagation_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -277,7 +293,23 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
int8_t **mem_ptr_void);
}
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_div_rem(cuda_stream_t *stream, int8_t **mem_ptr_void);
} // extern C
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
@@ -304,6 +336,11 @@ void generate_device_accumulator_bivariate(
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f);
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor);
/*
* generate univariate accumulator (lut) for device pointer
* v_stream - cuda stream
@@ -966,7 +1003,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
Torus *old_blocks;
Torus *small_lwe_vector;
int_radix_params params;
int_radix_lut<Torus> *luts_message_carry;
int_sc_prop_memory<Torus> *scp_mem;
int32_t *d_smart_copy_in;
@@ -979,10 +1015,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t max_num_radix_in_vec,
bool allocate_gpu_memory) {
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// create single carry propagation memory object
scp_mem = new int_sc_prop_memory<Torus>(stream, params, num_blocks_in_radix,
@@ -1002,43 +1034,15 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
d_smart_copy_out =
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
// create lut object for message and carry
luts_message_carry = new int_radix_lut<Torus>(
stream, params, 2, max_pbs_count, allocate_gpu_memory);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}
int_sum_ciphertexts_vec_memory(cuda_stream_t *stream, int_radix_params params,
uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec,
Torus *new_blocks, Torus *old_blocks,
Torus *small_lwe_vector,
int_radix_lut<Torus> *base_lut_object) {
Torus *small_lwe_vector) {
mem_reuse = true;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// create single carry propagation memory object
scp_mem = new int_sc_prop_memory<Torus>(stream, params, num_blocks_in_radix,
@@ -1054,29 +1058,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
d_smart_copy_out =
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
// create lut object for message and carry
luts_message_carry = new int_radix_lut<Torus>(
stream, params, 2, max_pbs_count, base_lut_object);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}
void release(cuda_stream_t *stream) {
@@ -1090,10 +1071,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
}
scp_mem->release(stream);
luts_message_carry->release(stream);
delete scp_mem;
delete luts_message_carry;
}
};
@@ -1220,7 +1199,7 @@ template <typename Torus> struct int_mul_memory {
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
stream, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res,
vector_result_sb, small_lwe_vector, luts_array);
vector_result_sb, small_lwe_vector);
}
void release(cuda_stream_t *stream) {
@@ -2156,6 +2135,327 @@ template <typename Torus> struct int_comparison_buffer {
}
};
template <typename Torus> struct int_div_rem_memory {
int_radix_params params;
bool mem_reuse = false;
// memory objects for other operations
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
int_overflowing_sub_memory<Torus> *overflow_sub_mem;
int_comparison_buffer<Torus> *comparison_buffer;
// lookup tables
int_radix_lut<Torus> **masking_luts_1;
int_radix_lut<Torus> **masking_luts_2;
int_radix_lut<Torus> *message_extract_lut_1;
int_radix_lut<Torus> *message_extract_lut_2;
int_radix_lut<Torus> **zero_out_if_overflow_did_not_happen;
int_radix_lut<Torus> **zero_out_if_overflow_happened;
int_radix_lut<Torus> **merge_overflow_flags_luts;
// sub streams
cuda_stream_t *sub_stream_1;
cuda_stream_t *sub_stream_2;
cuda_stream_t *sub_stream_3;
cuda_stream_t *sub_stream_4;
// temporary device buffers
Torus *remainder1;
Torus *remainder2;
Torus *numerator_block_stack;
Torus *numerator_block_1;
Torus *tmp_radix;
Torus *interesting_remainder1;
Torus *interesting_remainder2;
Torus *interesting_divisor;
Torus *divisor_ms_blocks;
Torus *new_remainder;
Torus *subtraction_overflowed;
Torus *did_not_overflow;
Torus *overflow_sum;
Torus *overflow_sum_radix;
Torus *tmp_1;
Torus *at_least_one_upper_block_is_non_zero;
Torus *cleaned_merged_interesting_remainder;
// allocate and initialize, if needed, the temporary arrays used to compute
// the cuda integer div_rem operation
void init_temporary_buffers(cuda_stream_t *stream, uint32_t num_blocks) {
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
// non-boolean temporary arrays, each with `num_blocks` blocks
remainder1 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
remainder2 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
numerator_block_stack = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
interesting_remainder2 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
interesting_divisor = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
divisor_ms_blocks = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
new_remainder = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
cleaned_merged_interesting_remainder = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
tmp_1 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
// temporary arrays used as stacks
tmp_radix = (Torus *)cuda_malloc_async(
big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream);
interesting_remainder1 = (Torus *)cuda_malloc_async(
big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream);
numerator_block_1 =
(Torus *)cuda_malloc_async(big_lwe_size * 2 * sizeof(Torus), stream);
// temporary arrays for boolean blocks
subtraction_overflowed =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
did_not_overflow =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
overflow_sum =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
overflow_sum_radix = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
at_least_one_upper_block_is_non_zero =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
}
// initialize lookup tables for div_rem operation
void init_lookup_tables(cuda_stream_t *stream, uint32_t num_blocks) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
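// Note: 31 - __builtin_clz(m) computes floor(log2(m)) for m > 0,
// e.g. message_modulus = 4 -> num_bits_in_message = 2.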
// create and generate masking_luts_1[] and masking_luts_2[]
// both are identical, but because they are used in two different
// executions in parallel we need two different pbs_buffers.
masking_luts_1 = new int_radix_lut<Torus> *[params.message_modulus - 1];
masking_luts_2 = new int_radix_lut<Torus> *[params.message_modulus - 1];
for (int i = 0; i < params.message_modulus - 1; i++) {
uint32_t shifted_mask = i;
std::function<Torus(Torus)> lut_f_masking =
[shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
masking_luts_1[i] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
masking_luts_2[i] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
Torus *luts[2] = {masking_luts_1[i]->lut, masking_luts_2[i]->lut};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
stream, luts[j], params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_masking);
}
}
// create and generate message_extract_lut_1 and message_extract_lut_2
// both are identical, but because they are used in two different
// executions in parallel we need two different pbs_buffers.
message_extract_lut_1 =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
message_extract_lut_2 =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
auto message_modulus = params.message_modulus;
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
Torus *luts[2] = {message_extract_lut_1->lut, message_extract_lut_2->lut};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
stream, luts[j], params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_message_extract);
}
// Give names to the closures to improve readability
auto overflow_happened = [](uint64_t overflow_sum) {
return overflow_sum != 0;
};
auto overflow_did_not_happen = [&overflow_happened](uint64_t overflow_sum) {
return !overflow_happened(overflow_sum);
};
// create and generate zero_out_if_overflow_did_not_happen
zero_out_if_overflow_did_not_happen = new int_radix_lut<Torus> *[2];
zero_out_if_overflow_did_not_happen[0] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
zero_out_if_overflow_did_not_happen[1] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
auto cur_lut_f = [&](Torus block, Torus overflow_sum) -> Torus {
if (overflow_did_not_happen(overflow_sum)) {
return 0;
} else {
return block;
}
};
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_did_not_happen[0]->lut,
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 2);
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_did_not_happen[1]->lut,
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 3);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
zero_out_if_overflow_happened[0] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
zero_out_if_overflow_happened[1] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
auto overflow_happened_f = [&](Torus block, Torus overflow_sum) -> Torus {
if (overflow_happened(overflow_sum)) {
return 0;
} else {
return block;
}
};
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_happened[0]->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
overflow_happened_f, 2);
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_happened[1]->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
overflow_happened_f, 3);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
};
merge_overflow_flags_luts[i] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
generate_device_accumulator_bivariate<Torus>(
stream, merge_overflow_flags_luts[i]->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_bit);
}
}
int_div_rem_memory(cuda_stream_t *stream, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory) {
this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true);
shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true);
overflow_sub_mem =
new int_overflowing_sub_memory<Torus>(stream, params, num_blocks, true);
comparison_buffer = new int_comparison_buffer<Torus>(
stream, COMPARISON_TYPE::NE, params, num_blocks, false, true);
init_lookup_tables(stream, num_blocks);
init_temporary_buffers(stream, num_blocks);
sub_stream_1 = new cuda_stream_t(stream->gpu_index);
sub_stream_2 = new cuda_stream_t(stream->gpu_index);
sub_stream_3 = new cuda_stream_t(stream->gpu_index);
sub_stream_4 = new cuda_stream_t(stream->gpu_index);
}
void release(cuda_stream_t *stream) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
// release and delete other operation memory objects
shift_mem_1->release(stream);
shift_mem_2->release(stream);
overflow_sub_mem->release(stream);
comparison_buffer->release(stream);
delete shift_mem_1;
delete shift_mem_2;
delete overflow_sub_mem;
delete comparison_buffer;
// release and delete lookup tables
// masking_luts_1 and masking_luts_2
for (int i = 0; i < params.message_modulus - 1; i++) {
masking_luts_1[i]->release(stream);
masking_luts_2[i]->release(stream);
delete masking_luts_1[i];
delete masking_luts_2[i];
}
delete[] masking_luts_1;
delete[] masking_luts_2;
// message_extract_lut_1 and message_extract_lut_2
message_extract_lut_1->release(stream);
message_extract_lut_2->release(stream);
delete message_extract_lut_1;
delete message_extract_lut_2;
// zero_out_if_overflow_did_not_happen
zero_out_if_overflow_did_not_happen[0]->release(stream);
zero_out_if_overflow_did_not_happen[1]->release(stream);
delete zero_out_if_overflow_did_not_happen[0];
delete zero_out_if_overflow_did_not_happen[1];
delete[] zero_out_if_overflow_did_not_happen;
// zero_out_if_overflow_happened
zero_out_if_overflow_happened[0]->release(stream);
zero_out_if_overflow_happened[1]->release(stream);
delete zero_out_if_overflow_happened[0];
delete zero_out_if_overflow_happened[1];
delete[] zero_out_if_overflow_happened;
// merge_overflow_flags_luts
for (int i = 0; i < num_bits_in_message; i++) {
merge_overflow_flags_luts[i]->release(stream);
delete merge_overflow_flags_luts[i];
}
delete[] merge_overflow_flags_luts;
// release sub streams
sub_stream_1->release();
sub_stream_2->release();
sub_stream_3->release();
sub_stream_4->release();
// drop temporary buffers
cuda_drop_async(remainder1, stream);
cuda_drop_async(remainder2, stream);
cuda_drop_async(numerator_block_stack, stream);
cuda_drop_async(numerator_block_1, stream);
cuda_drop_async(tmp_radix, stream);
cuda_drop_async(interesting_remainder1, stream);
cuda_drop_async(interesting_remainder2, stream);
cuda_drop_async(interesting_divisor, stream);
cuda_drop_async(divisor_ms_blocks, stream);
cuda_drop_async(new_remainder, stream);
cuda_drop_async(subtraction_overflowed, stream);
cuda_drop_async(did_not_overflow, stream);
cuda_drop_async(overflow_sum, stream);
cuda_drop_async(overflow_sum_radix, stream);
cuda_drop_async(tmp_1, stream);
cuda_drop_async(at_least_one_upper_block_is_non_zero, stream);
cuda_drop_async(cleaned_merged_interesting_remainder, stream);
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;

View File

@@ -5,7 +5,7 @@
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
@@ -111,6 +111,28 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // tbc
}
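As a hedged worked example (numbers not from the diff): plugging Torus = uint64_t and polynomial_size = 2048 into the three helpers above gives
// full_sm_tbc    = 2048*8 + 2048*8 + (2048/2)*16 = 49152 B (48 KiB)
// partial_sm_tbc = (2048/2)*16                   = 16384 B (16 KiB)
// minimum_sm_tbc = (2048/2)*16                   = 16384 B (16 KiB)
// so a device with at least 64 KiB of shared memory per block can run the
// TBC kernel in FULLSM mode, one between 32 KiB and 64 KiB in PARTIALSM,
// and one below 32 KiB falls back to NOSM.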
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
@@ -125,6 +147,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
@@ -213,6 +240,54 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
polynomial_size / 2 * sizeof(double2),
stream);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
Torus>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
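A stand-alone sketch of the same three-way selection, with illustrative names (not part of the diff), may make the branches above easier to follow:

#include <cstdint>
enum class SmTier { NOSM, PARTIALSM, FULLSM };
// Mirrors the branches above: the tier is chosen from the available shared
// memory, keeping minimum_sm_tbc reserved in every mode.
inline SmTier select_tbc_tier(uint64_t max_shared_memory, uint64_t full_sm,
                              uint64_t partial_sm, uint64_t minimum_sm_tbc) {
  if (max_shared_memory < partial_sm + minimum_sm_tbc)
    return SmTier::NOSM;      // whole accumulator spills to device memory
  if (max_shared_memory < full_sm + minimum_sm_tbc)
    return SmTier::PARTIALSM; // only the fft accumulator stays in shared memory
  return SmTier::FULLSM;      // both accumulators fit in shared memory
}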
@@ -281,6 +356,25 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
@@ -295,6 +389,13 @@ void scratch_cuda_programmable_bootstrap(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,

View File

@@ -51,6 +51,37 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
int8_t **pbs_buffer);
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
@@ -113,12 +144,25 @@ template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
double2 *keybundle_fft;
Torus *global_accumulator;
@@ -133,18 +177,20 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
Torus>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
Torus>(polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
Torus>(polynomial_size);
// cg
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
@@ -162,6 +208,19 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
#if CUDA_ARCH >= 900
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
auto num_blocks_acc_tbc = num_blocks_acc_cg;
#endif
if (allocate_gpu_memory) {
// Keybundle
if (max_shared_memory < full_sm_keybundle)
@@ -169,7 +228,16 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
num_blocks_keybundle * full_sm_keybundle, stream);
switch (pbs_variant) {
case DEFAULT:
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
break;
case PBS_VARIANT::DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
@@ -183,15 +251,27 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
break;
case CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
#if CUDA_ARCH >= 900
case TBC:
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm_tbc_accumulate bytes, TBC PBS will run on NOSM. If we have
// partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes, it
// will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
// Accumulator TBC
if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
num_blocks_acc_tbc * full_sm_tbc_accumulate, stream);
else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream);
break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
@@ -200,7 +280,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
stream);
global_accumulator = (Torus *)cuda_malloc_async(
num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
stream);
@@ -222,6 +302,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
if (d_mem_acc_cg)
cuda_drop_async(d_mem_acc_cg, stream);
break;
#if CUDA_ARCH >= 900
case TBC:
if (d_mem_acc_tbc)
cuda_drop_async(d_mem_acc_tbc, stream);
break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
@@ -232,10 +318,9 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);
#endif
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory);
#endif // CUDA_MULTI_BIT_H

View File

@@ -71,6 +71,18 @@ bool cuda_check_support_cooperative_groups() {
return cooperative_groups_supported > 0;
}
/// Returns
/// false if Thread Block Cluster is not supported.
/// true otherwise
bool cuda_check_support_thread_block_clusters() {
// To-do: Is this really the best way to check support?
int tbc_supported = 0;
check_cuda_error(
cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
return tbc_supported > 0;
}
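A hedged usage sketch (the caller below is hypothetical; the real dispatch also weighs polynomial size and shared memory via has_support_to_cuda_programmable_bootstrap_tbc):

// Hypothetical variant selection built on the runtime checks above.
PBS_VARIANT pick_pbs_variant() {
  if (cuda_check_support_thread_block_clusters())
    return PBS_VARIANT::TBC; // thread block clusters: Hopper (sm_90) and later
  if (cuda_check_support_cooperative_groups())
    return PBS_VARIANT::CG;
  return PBS_VARIANT::DEFAULT;
}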
/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {

View File

@@ -0,0 +1,83 @@
#include "integer/div_rem.cuh"
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_div_rem_kb<uint64_t>(
stream, (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
switch (mem->params.polynomial_size) {
case 512:
host_integer_div_rem_kb<uint64_t, Degree<512>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 1024:
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 2048:
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 4096:
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 8192:
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 16384:
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
default:
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
}
void cleanup_cuda_integer_div_rem(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}
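For orientation, a hedged sketch of the scratch -> run -> cleanup lifecycle these three entry points follow; every parameter below is a placeholder supplied by the caller:

// Hypothetical call sequence for an encrypted division (uint64_t blocks).
int8_t *mem = nullptr;
scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    stream, &mem, glwe_dimension, polynomial_size, big_lwe_dimension,
    small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
    grouping_factor, num_blocks, message_modulus, carry_modulus, MULTI_BIT,
    /*allocate_gpu_memory=*/true);
cuda_integer_div_rem_radix_ciphertext_kb_64(stream, quotient, remainder,
                                            numerator, divisor, mem, bsk, ksk,
                                            num_blocks);
cleanup_cuda_integer_div_rem(stream, &mem);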

View File

@@ -0,0 +1,587 @@
#ifndef TFHE_RS_DIV_REM_CUH
#define TFHE_RS_DIV_REM_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
int ceil_div(int a, int b) { return (a + b - 1) / b; }
// struct makes it easier to use a list of ciphertexts and move data between
// them. The struct does not allocate or drop any memory; it only keeps track
// of the number of ciphertexts in the list.
template <typename Torus> struct lwe_ciphertext_list {
Torus *data;
size_t max_blocks;
size_t len;
int_radix_params params;
size_t big_lwe_size;
size_t radix_size;
size_t big_lwe_size_bytes;
size_t radix_size_bytes;
size_t big_lwe_dimension;
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
: data(src), params(params), max_blocks(max_blocks) {
big_lwe_size = params.big_lwe_dimension + 1;
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
radix_size = max_blocks * big_lwe_size;
radix_size_bytes = radix_size * sizeof(Torus);
big_lwe_dimension = params.big_lwe_dimension;
len = max_blocks;
}
// copies ciphertexts from a Torus*, from `start_block` up to and including
// `finish_block`; does not change the value of `len`
void copy_from(Torus *src, size_t start_block, size_t finish_block,
cuda_stream_t *stream) {
size_t tmp_len = finish_block - start_block + 1;
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
tmp_len * big_lwe_size_bytes, stream);
}
// copies ciphertexts from an lwe_ciphertext_list, from `start_block` up to
// and including `finish_block`; does not change the value of `len`
void copy_from(const lwe_ciphertext_list &src, size_t start_block,
size_t finish_block, cuda_stream_t *stream) {
copy_from(src.data, start_block, finish_block, stream);
}
// copies ciphertexts from a Torus*, from `start_block` up to and including
// `finish_block`, updating the value of `len`
void clone_from(Torus *src, size_t start_block, size_t finish_block,
cuda_stream_t *stream) {
len = finish_block - start_block + 1;
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
len * big_lwe_size_bytes, stream);
}
// copies ciphertexts from an lwe_ciphertext_list, from `start_block` up to
// and including `finish_block`, updating the value of `len`
void clone_from(const lwe_ciphertext_list &src, size_t start_block,
size_t finish_block, cuda_stream_t *stream) {
clone_from(src.data, start_block, finish_block, stream);
}
// assign zero to blocks from `start_block` up to and including `finish_block`
void assign_zero(size_t start_block, size_t finish_block,
cuda_stream_t *stream) {
auto size = finish_block - start_block + 1;
cuda_memset_async(&data[start_block * big_lwe_size], 0,
size * big_lwe_size_bytes, stream);
}
// return pointer to last block
Torus *last_block() { return &data[(len - 1) * big_lwe_size]; }
// return pointer to first_block
Torus *first_block() { return data; }
// return block with `index`
Torus *get_block(size_t index) {
assert(index < len);
return &data[index * big_lwe_size];
}
bool is_empty() { return len == 0; }
// does not drop the actual memory in `data`, only reduces the value of `len` by one
void pop() {
if (len > 0)
len--;
else
assert(len > 0);
}
// insert ciphertext at index `ind`
void insert(size_t ind, Torus *ciphertext_block, cuda_stream_t *stream) {
assert(ind <= len);
assert(len < max_blocks);
size_t insert_offset = ind * big_lwe_size;
for (size_t i = len; i > ind; i--) {
Torus *src = &data[(i - 1) * big_lwe_size];
Torus *dst = &data[i * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream);
}
cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
big_lwe_size_bytes, stream);
len++;
}
// push ciphertext at the end of `data`
void push(Torus *ciphertext_block, cuda_stream_t *stream) {
assert(len < max_blocks);
size_t offset = len * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
big_lwe_size_bytes, stream);
len++;
}
// duplicate ciphertext into `number_of_blocks` ciphertexts
void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
cuda_stream_t *stream) {
assert(number_of_blocks <= max_blocks);
for (size_t i = 0; i < number_of_blocks; i++) {
Torus *dest = &data[i * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes,
stream);
}
len = number_of_blocks;
}
// used for debugging; prints the body of each ciphertext.
void print_blocks_body(const char *name) {
for (int i = 0; i < len; i++) {
print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
}
}
};
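A brief, hedged usage sketch of this wrapper; d_buffer, some_block, params and stream are placeholders:

// Hypothetical: view an existing device buffer as a list of 4 radix blocks.
lwe_ciphertext_list<uint64_t> list(d_buffer, params, /*max_blocks=*/4);
list.assign_zero(0, 3, stream);     // zero all four blocks
list.pop();                         // len: 4 -> 3, memory untouched
list.push(some_block, stream);      // len: 3 -> 4, copies one block in
uint64_t *last = list.last_block(); // pointer arithmetic only, no copy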
template <typename Torus>
__host__ void scratch_cuda_integer_div_rem_kb(
cuda_stream_t *stream, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_div_rem_memory<Torus>(stream, params, num_blocks,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void host_integer_div_rem_kb(cuda_stream_t *stream, Torus *quotient,
Torus *remainder, Torus *numerator,
Torus *divisor, void *bsk, uint64_t *ksk,
int_div_rem_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
auto radix_params = mem_ptr->params;
auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
uint32_t message_modulus = radix_params.message_modulus;
uint32_t carry_modulus = radix_params.carry_modulus;
uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
uint32_t total_bits = num_bits_in_message * num_blocks;
// put temporary buffers in lwe_ciphertext_list for easy use
lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> numerator_block_stack(
mem_ptr->numerator_block_stack, radix_params, num_blocks);
lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
radix_params, 1);
lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
num_blocks + 1);
lwe_ciphertext_list<Torus> interesting_remainder1(
mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
lwe_ciphertext_list<Torus> interesting_remainder2(
mem_ptr->interesting_remainder2, radix_params, num_blocks);
lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> subtraction_overflowed(
mem_ptr->subtraction_overflowed, radix_params, 1);
lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
radix_params, 1);
lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
1);
lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, stream);
remainder1.assign_zero(0, num_blocks - 1, stream);
remainder2.assign_zero(0, num_blocks - 1, stream);
cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, stream);
for (int i = total_bits - 1; i >= 0; i--) {
uint32_t block_of_bit = i / num_bits_in_message;
uint32_t pos_in_block = i % num_bits_in_message;
uint32_t msb_bit_set = total_bits - 1 - i;
uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;
// Index of the first block of the remainder that is fully trivial 0,
// with all blocks after it also trivial zeros.
// This number is in the range 1..=num_blocks - 1
uint32_t first_trivial_block = last_non_trivial_block + 1;
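// Illustrative values (not from the diff): with 2-bit messages and
// num_blocks = 4 (total_bits = 8), iteration i = 5 gives block_of_bit = 2,
// pos_in_block = 1, msb_bit_set = 2, last_non_trivial_block = 1 and
// first_trivial_block = 2.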
interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
stream);
interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
stream);
interesting_divisor.clone_from(divisor, 0, last_non_trivial_block, stream);
divisor_ms_blocks.clone_from(divisor,
(msb_bit_set + 1) / num_bits_in_message,
num_blocks - 1, stream);
// We split the divisor at a block position, when in reality the split
// should be at a bit position, meaning that (depending on msb_bit_set) the
// two parts may share some bits they should not. So we do one PBS on the
// last block of interesting_divisor and one on the first block of
// divisor_ms_blocks to trim out the bits that should not be there
auto trim_last_interesting_divisor_bits = [&](cuda_stream_t *stream) {
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
// The last block of the interesting part of the remainder
// can contain bits which we should not account for
// we have to zero them out.
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g. with 2 bits in the message:
// if pos_in_block is 0, we want to keep only the first bit
//   (right shift the mask by 1);
// if pos_in_block is 1, we want to keep both bits
//   (right shift the mask by 0)
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
// Create mask of 1s on the message part, 0s in the carries
uint32_t full_message_mask = message_modulus - 1;
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb(
stream, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsk, ksk, 1,
mem_ptr->masking_luts_1[shifted_mask]);
}; // trim_last_interesting_divisor_bits
auto trim_first_divisor_ms_bits = [&](cuda_stream_t *stream) {
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g. with 2 bits in the message:
// if pos_in_block is 0, we want to discard the first bit
//   (left shift the mask by 1);
// if pos_in_block is 1, we want to discard both bits
//   (left shift the mask by 2),
// i.e. shift_amount = pos_in_block + 1
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;
// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb(
stream, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsk, ksk, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits
// This does
// R := R << 1; R(0) := N(i)
//
// We could do that by left shifting R by one, then unchecked_add-ing the
// correct numerator bit.
//
// However, to keep the remainder clean (noise wise), what we do instead is
// put the block from which we need to extract the bit as the LSB of the
// remainder, so that left shifting will pull in the bit we need.
auto left_shift_interesting_remainder1 = [&](cuda_stream_t *stream3) {
numerator_block_1.clone_from(numerator_block_stack,
numerator_block_stack.len - 1,
numerator_block_stack.len - 1, stream3);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
stream3);
host_integer_radix_logical_scalar_shift_kb_inplace(
stream3, interesting_remainder1.data, 1, mem_ptr->shift_mem_1, bsk,
ksk, interesting_remainder1.len);
tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, stream3);
radix_blocks_rotate_left<<<interesting_remainder1.len, 256, 0,
stream3->stream>>>(
interesting_remainder1.data, tmp_radix.data, 1,
interesting_remainder1.len, big_lwe_size);
numerator_block_1.clone_from(interesting_remainder1,
interesting_remainder1.len - 1,
interesting_remainder1.len - 1, stream3);
interesting_remainder1.pop();
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next iteration
numerator_block_stack.push(numerator_block_1.first_block(), stream3);
}
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 = [&](cuda_stream_t *stream4) {
host_integer_radix_logical_scalar_shift_kb_inplace(
stream4, interesting_remainder2.data, 1, mem_ptr->shift_mem_2, bsk,
ksk, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_stream_1);
}
#pragma omp section
{
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_stream_2);
}
#pragma omp section
{
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_stream_3);
}
#pragma omp section
{
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_stream_4);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1);
cuda_synchronize_stream(mem_ptr->sub_stream_2);
cuda_synchronize_stream(mem_ptr->sub_stream_3);
cuda_synchronize_stream(mem_ptr->sub_stream_4);
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
// In practice interesting_remainder1 contains the numerator bit,
// but in that position, interesting_remainder2 always has a 0
auto &merged_interesting_remainder = interesting_remainder1;
host_addition(stream, merged_interesting_remainder.data,
merged_interesting_remainder.data,
interesting_remainder2.data, radix_params.big_lwe_dimension,
merged_interesting_remainder.len);
// after create_clean_version_of_merged_remainder
// `merged_interesting_remainder` will be reused as
// `cleaned_merged_interesting_remainder`
cleaned_merged_interesting_remainder.clone_from(
merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
stream);
assert(merged_interesting_remainder.len == interesting_divisor.len);
// `new_remainder` is not initialized yet, so we need to set its length
new_remainder.len = merged_interesting_remainder.len;
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cuda_stream_t *stream) {
host_integer_overflowing_sub_kb<Torus, params>(
stream, new_remainder.data, subtraction_overflowed.data,
merged_interesting_remainder.data, interesting_divisor.data, bsk, ksk,
mem_ptr->overflow_sub_mem, merged_interesting_remainder.len);
};
// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cuda_stream_t *stream) {
auto &trivial_blocks = divisor_ms_blocks;
if (trivial_blocks.is_empty()) {
cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
big_lwe_size_bytes, stream);
} else {
// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_with_zero_equality(
stream, tmp_1.data, trivial_blocks.data, mem_ptr->comparison_buffer,
bsk, ksk, trivial_blocks.len,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
tmp_1.len =
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
is_at_least_one_comparisons_block_true(
stream, at_least_one_upper_block_is_non_zero.data, tmp_1.data,
mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
}
};
// Creates a cleaned version (noise wise) of the merged remainder
// so that it can be safely used in bivariate PBSes
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder = [&](cuda_stream_t *stream) {
integer_radix_apply_univariate_lookup_table_kb(
stream, cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsk, ksk,
cleaned_merged_interesting_remainder.len,
mem_ptr->message_extract_lut_1);
};
// phase 2
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_stream_1);
}
#pragma omp section
{
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_stream_2);
}
#pragma omp section
{
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_stream_3);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1);
cuda_synchronize_stream(mem_ptr->sub_stream_2);
cuda_synchronize_stream(mem_ptr->sub_stream_3);
host_addition(stream, overflow_sum.data, subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
int factor = (i) ? 3 : 2;
int factor_lut_id = factor - 2;
overflow_sum_radix.fill_with_same_ciphertext(
overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
stream);
auto conditionally_zero_out_merged_interesting_remainder =
[&](cuda_stream_t *stream) {
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
stream, cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data,
overflow_sum_radix.data, bsk, ksk,
cleaned_merged_interesting_remainder.len,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
factor);
};
auto conditionally_zero_out_merged_new_remainder =
[&](cuda_stream_t *stream) {
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
stream, new_remainder.data, new_remainder.data,
overflow_sum_radix.data, bsk, ksk, new_remainder.len,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
};
auto set_quotient_bit = [&](cuda_stream_t *stream) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, did_not_overflow.data, subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]);
host_addition(stream, &quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size],
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
};
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(
mem_ptr->sub_stream_1);
}
#pragma omp section
{
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_stream_2);
}
#pragma omp section
{
// quotient
set_quotient_bit(mem_ptr->sub_stream_3);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1);
cuda_synchronize_stream(mem_ptr->sub_stream_2);
cuda_synchronize_stream(mem_ptr->sub_stream_3);
assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
assert(first_trivial_block - 1 == new_remainder.len);
remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
first_trivial_block - 1, stream);
remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, stream);
}
assert(remainder1.len == remainder2.len);
// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition(stream, remainder, remainder1.data, remainder2.data,
radix_params.big_lwe_dimension, remainder1.len);
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_stream_1, remainder, remainder, bsk, ksk, num_blocks,
mem_ptr->message_extract_lut_1);
}
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_stream_2, quotient, quotient, bsk, ksk, num_blocks,
mem_ptr->message_extract_lut_2);
}
}
mem_ptr->sub_stream_1->synchronize();
mem_ptr->sub_stream_2->synchronize();
}
#endif // TFHE_RS_DIV_REM_CUH
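For reference, a hedged plaintext analogue of the restoring (shift-and-subtract) division the encrypted loop above implements, one bit per iteration; the encrypted version replaces the direct comparison with overflowing_sub and the overflow-flag LUTs:

#include <cstdint>
// Plaintext model of the loop above (illustrative only; assumes divisor != 0).
void plain_div_rem(uint64_t numerator, uint64_t divisor, uint32_t total_bits,
                   uint64_t &quotient, uint64_t &remainder) {
  quotient = 0;
  remainder = 0;
  for (int i = total_bits - 1; i >= 0; i--) {
    // R := R << 1; R(0) := N(i)
    remainder = (remainder << 1) | ((numerator >> i) & 1);
    if (remainder >= divisor) {     // encrypted: overflowing_sub + zero-out LUTs
      remainder -= divisor;         // keep new_remainder
      quotient |= uint64_t(1) << i; // set_quotient_bit
    }
  }
}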

View File

@@ -141,3 +141,41 @@ void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}
void scratch_cuda_apply_univariate_lut_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, void *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_apply_univariate_lut_kb<uint64_t>(
stream, (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void *ksk, void *bsk,
uint32_t num_blocks) {
host_apply_univariate_lut_kb<uint64_t>(
stream, static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk,
num_blocks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -172,6 +172,47 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb_factor(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut, uint32_t shift) {
cudaSetDevice(stream->gpu_index);
// apply_lookup_table_bivariate
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
auto message_modulus = params.message_modulus;
// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
big_lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
// Apply LUT
cuda_keyswitch_lwe_ciphertext_vector(
stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
// Rotates the slice in-place such that the first mid elements of the slice move
// to the end while the last array_length elements move to the front. After
// calling rotate_left, the element previously at index mid will become the
// first element in the slice.
@@ -235,6 +276,24 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
message_modulus, carry_modulus, wrapped_f);
}
template <typename Torus>
void generate_lookup_table_bivariate_with_factor(
Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor) {
Torus factor_u64 = factor;
auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
Torus lhs = (input / factor_u64) % message_modulus;
Torus rhs = (input % factor_u64) % message_modulus;
return f(lhs, rhs);
};
generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, wrapped_f);
}
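The wrapped function treats its single input as a packed pair: the left operand is pre-scaled by factor before the two ciphertexts are summed, so input = lhs * factor + rhs, and the lambda recovers each half with one division and one modulo. A host-side sketch of the encoding round-trip, with hypothetical 2-bit parameters:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t message_modulus = 4; // illustrative 2-bit messages
  const uint64_t factor = 4;          // scaling applied to the left operand
  uint64_t lhs = 3, rhs = 2;
  // Packing (done homomorphically by pack_bivariate_blocks)
  uint64_t input = lhs * factor + rhs; // 14
  // Decoding (done inside wrapped_f)
  uint64_t lhs_dec = (input / factor) % message_modulus; // 3
  uint64_t rhs_dec = (input % factor) % message_modulus; // 2
  printf("%llu %llu\n", (unsigned long long)lhs_dec,
         (unsigned long long)rhs_dec);
}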
/*
* generate bivariate accumulator for device pointer
* v_stream - cuda stream
@@ -266,7 +325,38 @@ void generate_device_accumulator_bivariate(
}
/*
* generate bivariate accumulator for device pointer
* generate bivariate accumulator with factor scaling for device pointer
* v_stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill bivariate accumulator
generate_lookup_table_bivariate_with_factor<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
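The host staging buffer h_lut cannot be freed until the asynchronous copy has actually executed, so the release is enqueued on the stream behind the copy. A minimal sketch of the same idea using the plain runtime API (cudaLaunchHostFunc rather than this repo's cuda_stream_add_callback wrapper):

#include <cuda_runtime.h>
#include <cstdlib>

static void CUDART_CB free_host(void *ptr) { free(ptr); }

void upload_and_release(float *d_dst, size_t n, cudaStream_t stream) {
  float *h_buf = (float *)malloc(n * sizeof(float));
  for (size_t i = 0; i < n; i++)
    h_buf[i] = (float)i; // fill the staging buffer
  cudaMemcpyAsync(d_dst, h_buf, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  // Free h_buf only once the copy has executed in stream order
  cudaLaunchHostFunc(stream, free_host, h_buf);
}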
/*
* generate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for accumulator
* ...
@@ -771,4 +861,31 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
signs_a, bsk, ksk, 1, lut);
}
}
template <typename Torus>
void scratch_cuda_apply_univariate_lut_kb(cuda_stream_t *stream,
int_radix_lut<Torus> **mem_ptr,
Torus *input_lut,
uint32_t num_radix_blocks,
int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_radix_lut<Torus>(stream, params, 1, num_radix_blocks,
allocate_gpu_memory);
cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
stream);
}
template <typename Torus>
void host_apply_univariate_lut_kb(cuda_stream_t *stream, Torus *radix_lwe_out,
Torus *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *ksk,
void *bsk, uint32_t num_blocks) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, radix_lwe_out, radix_lwe_in, bsk, ksk, num_blocks, mem);
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH

View File

@@ -79,14 +79,20 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
grouping_factor, message_modulus, carry_modulus);
switch (polynomial_size) {
case 256:
case 512:
case 1024:
case 2048:
case 4096:
case 8192:
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Only N = 2048 is supported")
"Supported N's are powers of two in the interval [256..16384].")
}
}
@@ -128,6 +134,30 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
@@ -136,9 +166,33 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Only N = 2048 is supported")
"Supported N's are powers of two in the interval [256..16384].")
}
}
@@ -225,8 +279,8 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
num_radix_in_vec);
break;
default:
PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
free(terms_degree);

View File

@@ -91,12 +91,15 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
}
}
template <typename Torus, sharedMemDegree SMD>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
extern __shared__ int8_t sharedmem[];
Torus *result = (Torus *)sharedmem;
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -106,6 +109,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
size_t block_stride = blockIdx.y * block_size;
auto dst_block = &dst_radix[block_stride];
if constexpr (SMD == NOSM)
result = dst_block;
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
@@ -121,9 +127,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}
// put result from shared mem to global mem
if constexpr (SMD == FULLSM)
for (int i = tid; i < block_size; i += stride)
dst_block[i] = result[i];
}
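tree_add_chunks is now compiled in two variants selected at launch time: FULLSM stages the accumulation in dynamic shared memory and copies it out at the end, while NOSM aliases the staging pointer to the global-memory destination and skips the copy. A self-contained sketch of this variant-selection pattern (the enum here is local to the sketch; the repo's sharedMemDegree may carry more values):

#include <cuda_runtime.h>

enum sharedMemDegree { NOSM, FULLSM };

template <typename T, sharedMemDegree SMD>
__global__ void accumulate(T *dst, const T *src, int n) {
  extern __shared__ int8_t sharedmem[];
  T *buf = (T *)sharedmem;
  if constexpr (SMD == NOSM)
    buf = dst; // no staging buffer: accumulate directly in global memory
  for (int i = threadIdx.x; i < n; i += blockDim.x)
    buf[i] = src[i] + src[n + i];
  __syncthreads();
  if constexpr (SMD == FULLSM) // copy-out only needed when staged in smem
    for (int i = threadIdx.x; i < n; i += blockDim.x)
      dst[i] = buf[i];
}

// Host side: pick the variant that fits the device's shared memory budget
void launch(float *dst, const float *src, int n, size_t max_sm,
            cudaStream_t stream) {
  size_t sm_size = n * sizeof(float);
  if (sm_size < max_sm)
    accumulate<float, FULLSM><<<1, 256, sm_size, stream>>>(dst, src, n);
  else
    accumulate<float, NOSM><<<1, 256, 0, stream>>>(dst, src, n);
}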
template <typename Torus, class params>
@@ -181,11 +187,20 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
stream, params, num_blocks_in_radix, max_num_radix_in_vec,
allocate_gpu_memory);
@@ -203,11 +218,6 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
@@ -247,8 +257,16 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
if (sm_size < max_shared_memory)
tree_add_chunks<Torus, FULLSM>
<<<add_grid, 512, sm_size, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size,
num_blocks);
else
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
check_cuda_error(cudaGetLastError());
size_t total_count = 0;
size_t message_count = 0;
@@ -260,6 +278,34 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
auto luts_message_carry =
new int_radix_lut<Torus>(stream, mem_ptr->params, 2, total_count, true);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
size_t copy_size = total_count * sizeof(Torus);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
@@ -272,6 +318,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());
if (carry_count > 0)
cuda_set_value_async<Torus>(
@@ -291,6 +338,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
2, 0, max_shared_memory, mem_ptr->params.pbs_type);
luts_message_carry->release(stream);
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_blocks;
copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
@@ -386,6 +435,7 @@ __host__ void host_integer_mult_radix_kb(
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
vector_lsb_rhs, vector_msb_rhs, num_blocks);
check_cuda_error(cudaGetLastError());
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
@@ -401,6 +451,7 @@ __host__ void host_integer_mult_radix_kb(
vector_result_msb, glwe_dimension,
lsb_vector_block_count, msb_vector_block_count,
num_blocks);
check_cuda_error(cudaGetLastError());
int terms_degree[2 * num_blocks * num_blocks];
for (int i = 0; i < num_blocks * num_blocks; i++) {
@@ -427,11 +478,20 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);

View File

@@ -35,11 +35,20 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
allocate_gpu_memory);

View File

@@ -1 +1,44 @@
#include "programmable_bootstrap.cuh"
template <>
__device__ int get_this_block_rank(grid_group &group, bool support_dsm) {
return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, grid_group &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm) {
double2 *buffer_slice =
global_memory_buffer +
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
return buffer_slice;
}
#if CUDA_ARCH >= 900
template <>
__device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) {
if (support_dsm)
return cluster.block_rank();
else
return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, cluster_group &cluster,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm) {
double2 *buffer_slice;
if (support_dsm) {
extern __shared__ double2 smem[];
buffer_slice = cluster.map_shared_rank(
smem, glwe_id + level_id * (glwe_dimension + 1));
} else {
buffer_slice =
global_memory_buffer +
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
}
return buffer_slice;
}
#endif
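On sm_90, map_shared_rank lets one block of a cluster read another block's shared memory directly (distributed shared memory), which is what the TBC variant uses in place of the global-memory join buffer. A minimal stand-alone sketch, assuming a Hopper-class device and a recent CUDA toolkit:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Each block of a 2-block cluster publishes its rank in its own shared
// memory, then reads its neighbor's copy through the DSM mapping.
__global__ void __cluster_dims__(2, 1, 1) dsm_demo(int *out) {
  __shared__ int smem[1];
  cg::cluster_group cluster = cg::this_cluster();
  unsigned int rank = cluster.block_rank();
  if (threadIdx.x == 0)
    smem[0] = (int)rank;
  cluster.sync(); // every block's shared memory is now initialized
  unsigned int neighbor = (rank + 1) % cluster.num_blocks();
  int *remote = cluster.map_shared_rank(smem, neighbor);
  if (threadIdx.x == 0)
    out[rank] = remote[0];
  cluster.sync(); // keep all blocks resident until remote reads finish
}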

View File

@@ -1,8 +1,120 @@
#include "../../include/device.h"
#include "../../include/programmable_bootstrap.h"
#include "../include/device.h"
#include "programmable_bootstrap_classic.cuh"
#include "programmable_bootstrap_multibit.cuh"
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include "cooperative_groups.h"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename G>
__device__ int get_this_block_rank(G &group, bool support_dsm);
template <typename G>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, G &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm);
template <typename Torus, typename G, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration, G &group,
bool support_dsm = false) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
auto buffer_slice =
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc =
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
@@ -112,3 +224,5 @@ void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
"moduli are supported.")
}
}
#endif
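mul_ggsw_glwe is now templated on the group type, so the same body can synchronize either a whole cooperative grid (grid_group) or a thread block cluster (cluster_group); the per-group behavior is isolated in the get_this_block_rank / get_join_buffer_element specializations. A minimal sketch of device code written generically over a cooperative-groups handle:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Any group handle exposing sync() can drive the same device code:
// thread_block, grid_group (cooperative launch), or cluster_group (sm_90).
template <typename G>
__device__ void barrier_then_scale(float *x, int i, float s, G &group) {
  group.sync();
  x[i] *= s;
}

__global__ void demo(float *x, float s) {
  cg::thread_block block = cg::this_thread_block();
  barrier_then_scale(x, (int)threadIdx.x, s, block);
}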

View File

@@ -67,8 +67,7 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
break;
default:
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
"N's are powers of two in the interval [256..16384].")
}
}

View File

@@ -7,7 +7,6 @@
#endif
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
@@ -15,104 +14,13 @@
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
/*
* Kernel that computes the classical PBS using cooperative groups
*
@@ -222,7 +130,7 @@ __global__ void device_programmable_bootstrap_cg(
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);

View File

@@ -1,5 +1,5 @@
#ifndef CUDA_CG_MULTIBIT_PBS_CUH
#define CUDA_CG_MULTIBIT_PBS_CUH
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
@@ -11,6 +11,7 @@
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -106,9 +107,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
@@ -240,7 +241,9 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
max_shared_memory);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
@@ -248,7 +251,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
}
template <typename Torus, class params>
__host__ void execute_cg_external_product_loop(
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
@@ -336,7 +339,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
@@ -349,7 +353,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
lwe_chunk_size, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,

View File

@@ -1,5 +1,8 @@
#include "programmable_bootstrap_cg_classic.cuh"
#include "programmable_bootstrap_classic.cuh"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_classic.cuh"
#endif
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
@@ -12,6 +15,176 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
#else
return false;
#endif
}
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
#endif
/*
* Returns the buffer size for 64 bits executions
*/
@@ -143,9 +316,19 @@ void scratch_cuda_programmable_bootstrap_32(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -168,9 +351,19 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -321,7 +514,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
@@ -330,9 +523,28 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (32)");
pbs_buffer<uint32_t, CLASSICAL> *buffer =
(pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
switch (buffer->pbs_variant) {
case TBC:
#if CUDA_ARCH >= 900
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key),
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
break;
case CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -344,7 +556,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -356,6 +569,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
@@ -433,7 +650,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
@@ -441,9 +658,28 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (64)");
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key),
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
break;
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -455,7 +691,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -467,6 +704,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/*
@@ -546,3 +787,41 @@ template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
cuda_stream_t *stream, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif

View File

@@ -3,6 +3,10 @@
#include "programmable_bootstrap_multibit.cuh"
#include "programmable_bootstrap_multibit.h"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_multibit.cuh"
#endif
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory) {
@@ -11,6 +15,57 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
#else
return false;
#endif
}
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
@@ -185,15 +240,34 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
break;
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -201,11 +275,11 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -213,10 +287,13 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
template <typename Torus, typename STorus>
@@ -356,9 +433,20 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
else
#endif
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
@@ -378,25 +466,69 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
x->release(stream);
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
* device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
* root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
* suitable divisor. If greater, skips ahead to the (log2(max_num_pbs) - 11)-th
* such divisor, which yields a smaller chunk size.
*
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
int max_blocks_per_sm;
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100, so 256-bit integer
// multiplication can run
int log2_max_num_pbs = std::log2(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}
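The divisor search walks i downward from sqrt(x) and returns the ith_divisor-th divisor it meets. A host-only sketch with concrete numbers, where x stands in for num_sms * max_blocks_per_sm (the value 108 is illustrative only): the divisors of 108 at or below sqrt(108) are, in descending order, 9, 6, 4, 3, 2, 1, so ith_divisor = 0 picks 9 and ith_divisor = 2 picks 4.

#include <cmath>
#include <cstdio>

// Same selection logic as get_lwe_chunk_size, without the occupancy query
static int pick_divisor(int x, int ith_divisor) {
  int count = 0;
  for (int i = (int)std::sqrt((double)x); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor)
        return i;
      count++;
    }
  }
  return 1;
}

int main() {
  printf("%d %d\n", pick_divisor(108, 0), pick_divisor(108, 2)); // 9 4
}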
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
@@ -437,3 +569,180 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<256>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<512>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<1024>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<2048>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<4096>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<8192>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<16384>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
"the ciphertext representation (64)");
switch (polynomial_size) {
case 256:
host_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t,
AmortizedDegree<256>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<512>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<1024>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 2048:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<2048>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<4096>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<8192>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<16384>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template void
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
#endif

View File

@@ -227,8 +227,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
blockIdx.x);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
@@ -469,7 +468,9 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(input_lwe_ciphertext_count);
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
max_shared_memory);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
@@ -630,7 +631,8 @@ __host__ void host_multi_bit_programmable_bootstrap(
// If a chunk size is not passed to this function, select one.
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(num_samples);
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

View File

@@ -0,0 +1,424 @@
#ifndef CUDA_TBC_PBS_CUH
#define CUDA_TBC_PBS_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
/*
 * Kernel that computes the classical PBS using thread block clusters and,
 * when supported, distributed shared memory
 *
 * - lwe_array_out: vector of output LWE ciphertexts, with length
 * (glwe_dimension * polynomial_size + 1) * num_samples
 * - lut_vector: vector of lookup tables with
 * length (glwe_dimension + 1) * polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
 * num_samples
 *
 * Each y-block computes one element of the lwe_array_out.
 */
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_tbc(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool support_dsm) {
cluster_group cluster = this_cluster();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
if (support_dsm)
selected_memory += sizeof(Torus) * polynomial_size;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
double2 *accumulator_fft =
(double2 *)accumulator_rotated +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM) {
accumulator_fft = (double2 *)sharedmem;
if (support_dsm)
accumulator_fft += (ptrdiff_t)(polynomial_size / 2);
}
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[i], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, cluster_group, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
synchronize_threads_in_block();
}
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap_tbc(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
}
/*
* Host wrapper
*/
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
auto supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t full_dm = full_sm;
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = buffer->d_mem;
double2 *buffer_fft = buffer->global_accumulator_fft;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid;
config.blockDim = thds;
cudaLaunchAttribute attribute[1];
attribute[0].id = cudaLaunchAttributeClusterDimension;
attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
attribute[0].val.clusterDim.y = (glwe_dimension + 1);
attribute[0].val.clusterDim.z = 1;
config.attrs = attribute;
config.numAttrs = 1;
config.stream = stream->stream;
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
config.dynamicSmemBytes = minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
supports_dsm));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm, supports_dsm));
} else {
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
supports_dsm));
}
}
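
The cudaLaunchKernelEx setup above follows the standard CUDA 12 thread-block-cluster launch pattern. A self-contained toy version of the same mechanism (kernel and buffer names are illustrative, not part of the backend; requires compilation for sm_90):

#include "cooperative_groups.h"
#include <cstdio>
#include <cuda_runtime.h>
namespace cg = cooperative_groups;

__global__ void toy_cluster_kernel(float *out) {
  // block_rank() identifies this block within its cluster (sm_90+ only).
  out[blockIdx.x] = (float)cg::this_cluster().block_rank();
}

int main() {
  float *d_out;
  cudaMalloc(&d_out, 8 * sizeof(float));
  cudaLaunchConfig_t config = {0};
  config.gridDim = dim3(8); // must be a multiple of the cluster size
  config.blockDim = dim3(32);
  cudaLaunchAttribute attribute;
  attribute.id = cudaLaunchAttributeClusterDimension;
  attribute.val.clusterDim.x = 4; // plays the role of level_count *
  attribute.val.clusterDim.y = 1; // (glwe_dimension + 1) in the code above
  attribute.val.clusterDim.z = 1;
  config.attrs = &attribute;
  config.numAttrs = 1;
  cudaError_t err = cudaLaunchKernelEx(&config, toy_cluster_kernel, d_out);
  printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}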
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(params::degree);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, NOSM>, thds,
0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, FULLSM>, thds,
full_sm);
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory < minimum_sm) {
// If we cannot store a single polynomial in a block's shared memory we
// cannot use TBC
return false;
} else {
return cuda_check_support_thread_block_clusters();
}
}
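
cuda_check_support_thread_block_clusters() is defined elsewhere in the backend; as an assumption about what such a probe looks like, the CUDA 12 runtime exposes the capability directly as a device attribute:

#include <cuda_runtime.h>

// Sketch of a cluster-capability probe (hypothetical name; the backend's
// cuda_check_support_thread_block_clusters() is assumed equivalent).
bool device_supports_thread_block_clusters(int gpu_index) {
  int supported = 0;
  // Non-zero on devices that can launch thread block clusters
  // (compute capability 9.0+, CUDA 12 runtime).
  cudaDeviceGetAttribute(&supported, cudaDevAttrClusterLaunch, gpu_index);
  return supported != 0;
}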
template <typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
return false;
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory))
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
int cluster_size;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid_accumulate;
config.blockDim = thds;
config.numAttrs = 0;
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
&config));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, &config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
&config));
}
return cluster_size >= level_count * (glwe_dimension + 1);
}
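
For a sense of scale under CUDA's documented cluster limits (a worked note, not code from the backend):

// The cluster must span one whole PBS: level_count * (glwe_dimension + 1)
// blocks. With glwe_dimension = 1 this is 2 blocks at level_count = 1 and 4
// at level_count = 2. The portable maximum is 8 blocks per cluster; H100
// accepts up to 16 once cudaFuncAttributeNonPortableClusterSizeAllowed is
// set, which is why the checks above opt in before querying
// cudaOccupancyMaxPotentialClusterSize.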
#endif // CUDA_TBC_PBS_CUH

View File

@@ -0,0 +1,480 @@
#ifndef CUDA_TBC_MULTIBIT_PBS_CUH
#define CUDA_TBC_MULTIBIT_PBS_CUH
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block, bool support_dsm) {
cluster_group cluster = this_cluster();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM) {
// The first (polynomial_size/2) * sizeof(double2) bytes are reserved for
// external product using distributed shared memory
selected_memory = sharedmem;
if (support_dsm)
selected_memory += sizeof(Torus) * polynomial_size;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM) {
accumulator_fft = (double2 *)sharedmem;
if (support_dsm)
accumulator_fft += (ptrdiff_t)(polynomial_size / 2);
}
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, cluster_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
synchronize_threads_in_block();
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Persist the accumulator so the next chunk iteration can reload it
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // distributed shared memory
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator_fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator + accumulator_fft
}
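
To make the three budgets concrete, a worked example with Torus = uint64_t and N = 2048 (values derived from the formulas above; the H100 limit is the documented opt-in maximum):

// For Torus = uint64_t (8 bytes) and polynomial_size N = 2048:
//   dsm     = 8 * 2048     = 16 KiB (one polynomial exchanged through
//                                    distributed shared memory)
//   partial = 8 * 2048     = 16 KiB (accumulator_fft: N/2 double2 values)
//   full    = 8 * 2048 * 2 = 32 KiB (accumulator + accumulator_fft)
// An H100 allows up to ~227 KiB of dynamic shared memory per block after
// opt-in, so full + dsm = 48 KiB fits and the FULLSM path is selected.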
template <typename Torus, typename STorus, typename params>
__host__ void scratch_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
bool supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc_accumulate = 0;
if (supports_dsm)
minimum_sm_tbc_accumulate =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
max_shared_memory);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void execute_tbc_external_product_loop(
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_chunk_size, uint32_t max_shared_memory,
int lwe_offset) {
auto supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_dm =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_dm =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_dm = 0;
if (supports_dsm)
minimum_dm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;
auto global_accumulator = buffer->global_accumulator;
auto buffer_fft = buffer->global_accumulator_fft;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid_accumulate;
config.blockDim = thds;
cudaLaunchAttribute attribute[1];
attribute[0].id = cudaLaunchAttributeClusterDimension;
attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
attribute[0].val.clusterDim.y = (glwe_dimension + 1);
attribute[0].val.clusterDim.z = 1;
config.attrs = attribute;
config.numAttrs = 1;
config.stream = stream->stream;
if (max_shared_memory < partial_dm + minimum_dm) {
config.dynamicSmemBytes = minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, full_dm, supports_dsm));
} else if (max_shared_memory < full_dm + minimum_dm) {
config.dynamicSmemBytes = partial_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, partial_dm, supports_dsm));
} else {
config.dynamicSmemBytes = full_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, 0, supports_dsm));
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, max_shared_memory,
lwe_chunk_size, lwe_offset);
// Accumulate
execute_tbc_external_product_loop<Torus, params>(
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
lwe_chunk_size, max_shared_memory, lwe_offset);
}
}
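
The chunked loop is easiest to see with numbers (illustrative values, not a fixed parameter set):

// With lwe_dimension = 888 and grouping_factor = 3 there are 888 / 3 = 296
// keybundle positions. A chunk size of 64 then gives ceil(296 / 64) = 5
// iterations: four full chunks of 64 and a final partial chunk of
// 296 - 4 * 64 = 40, which is why execute_tbc_external_product_loop clamps
// chunk_size with std::min.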
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory <= minimum_sm) {
// If we cannot store a single polynomial in a block's shared memory we
// cannot use TBC
return false;
} else {
return cuda_check_support_thread_block_clusters();
}
}
template <typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters())
return false;
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc_accumulate = 0;
if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory))
minimum_sm_tbc_accumulate =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
int cluster_size;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid_accumulate;
config.blockDim = thds;
config.numAttrs = 0;
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
&config));
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
&config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
&config));
}
return cluster_size >= level_count * (glwe_dimension + 1);
}
template __host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
uint32_t polynomial_size, uint32_t max_shared_memory);
#endif // CUDA_TBC_MULTIBIT_PBS_CUH

View File

@@ -50,12 +50,7 @@ public:
constexpr static int opt = choose_opt_amortized(N);
constexpr static int log2_degree = log2(N);
};
enum sharedMemDegree {
NOSM = 0,
PARTIALSM = 1,
FULLSM = 2
};
enum sharedMemDegree { NOSM = 0, PARTIALSM = 1, FULLSM = 2 };
class ForwardFFT {
public:

View File

@@ -167,6 +167,38 @@ public:
}
};
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
st.SkipWithError("Configuration not supported for tbc operation");
return;
}
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
true, chunk_size);
for (auto _ : st) {
// Execute PBS
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
cuda_synchronize_stream(stream);
}
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
}
#endif
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
@@ -221,6 +253,39 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
}
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
st.SkipWithError("Configuration not supported for tbc operation");
return;
}
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(stream->gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
stream, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_programmable_bootstrap(stream, &buffer);
}
#endif
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
@@ -317,6 +382,9 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
for (auto x : params) {
for (int input_lwe_ciphertext_count = 1; input_lwe_ciphertext_count <= 4096;
input_lwe_ciphertext_count *= 2) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, 0});
for (int lwe_chunk_size = 1;
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
lwe_chunk_size *= 2)
@@ -332,6 +400,25 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
}
}
static void
CGBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
// PARAM_MESSAGE_2_CARRY_2_KS_PBS
std::vector<BootstrapBenchmarkParams> params = {
(BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1},
};
// Add to the list of parameters to benchmark
for (int num_samples = 1; num_samples <= 4096; num_samples *= 2)
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, num_samples});
}
}
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
@@ -351,6 +438,14 @@ BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
}
}
#if CUDA_ARCH >= 900
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, TbcMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
#endif
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, CgMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
@@ -363,11 +458,23 @@ BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
#if CUDA_ARCH >= 900
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, TbcPBC)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
#endif
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, CgPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",

View File

@@ -1,9 +1,9 @@
#ifndef UTILS_H
#define UTILS_H
#include "tfhe.h"
#include <device.h>
#include <functional>
#include <tfhe.h>
typedef struct Seed {
uint64_t lo;

View File

@@ -205,7 +205,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext);
ASSERT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
@@ -216,7 +216,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta);
ASSERT_EQ(decoded, plaintext / delta);
}
}
}

View File

@@ -227,7 +227,13 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
888, 1, 16384,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10});
128, 3, 1, 10},
(MultiBitProgrammableBootstrapTestParams){
972, 1, 8192,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
68, 3, 1, 1});
std::string printParamName(
::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {
MultiBitProgrammableBootstrapTestParams params = p.param;

View File

@@ -687,6 +687,37 @@ extern "C" {
pub fn cleanup_cuda_full_propagation(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_apply_univariate_lut_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
input_lut: *const c_void,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_apply_univariate_lut_kb_64(
v_stream: *const c_void,
output_radix_lwe: *mut c_void,
input_radix_lwe: *const c_void,
mem_ptr: *mut i8,
ksk: *const c_void,
bsk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_apply_univariate_lut_kb_64(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
@@ -959,4 +990,37 @@ extern "C" {
mem_ptr: *mut *mut i8,
);
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
v_stream: *const c_void,
quotient: *mut c_void,
remainder: *mut c_void,
numerator: *const c_void,
divisor: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_integer_div_rem(v_stream: *const c_void, mem_ptr: *mut *mut i8);
} // extern "C"

View File

@@ -57,7 +57,7 @@ def check_security(filename):
estimator_level = log(min(usvp_level["rop"], dual_level["rop"]), 2)
security_level = f"security level = {estimator_level} bits"
if estimator_level < 127:
print("FAIL\t({security_level})")
print(f"FAIL\t({security_level})")
reason = f"attained {security_level} target is 128 bits"
to_update.append((param, reason))
continue

View File

@@ -8,11 +8,13 @@ function usage() {
echo "--help Print this message"
echo "--build-only Pass to only build the tests without running them"
echo "--gpu Enable GPU support"
echo "--cargo-profile The profile used to build TFHE-rs, release by default"
echo
}
BUILD_ONLY=0
WITH_FEATURE_GPU="OFF"
CARGO_PROFILE="release"
while [ -n "$1" ]
do
case "$1" in
@@ -28,6 +30,12 @@ do
"--gpu" )
WITH_FEATURE_GPU="ON"
;;
"--cargo-profile" )
shift
CARGO_PROFILE="$1"
;;
*)
echo "Unknown param : $1"
exit 1

View File

@@ -1,3 +1,5 @@
#![allow(non_local_definitions)]
use ark_ec::bls12::{Bls12, Bls12Config, TwistType};
use ark_ff::fields::*;
use ark_ff::MontFp;
@@ -576,8 +578,8 @@ pub mod g1 {
writer.write_all(&bytes)?;
} else {
let mut bytes = [0u8; 2 * G1_SERIALIZED_SIZE];
bytes[1..1 + G1_SERIALIZED_SIZE].copy_from_slice(&x_bytes[..]);
bytes[2 + G1_SERIALIZED_SIZE..].copy_from_slice(&serialize_fq(p.y)[..]);
bytes[1..G1_SERIALIZED_SIZE].copy_from_slice(&x_bytes[..]);
bytes[1 + G1_SERIALIZED_SIZE..].copy_from_slice(&serialize_fq(p.y)[..]);
encoding.encode_flags(&mut bytes);
writer.write_all(&bytes)?;

View File

@@ -91,6 +91,21 @@ pub struct PrivateCommit<G: Curve> {
__marker: PhantomData<G>,
}
pub fn compute_crs_params(
d: usize,
k: usize,
b: u64,
_q: u64, // we keep q here to make sure the API is consistent with [crs_gen]
t: u64,
) -> (usize, usize, u64) {
let b_r = d as u64 / 2 + 1;
let big_d =
d + k * t.ilog2() as usize + (d + k) * (2 + b.ilog2() as usize + b_r.ilog2() as usize);
let n = big_d + 1;
(n, big_d, b_r)
}
pub fn crs_gen<G: Curve>(
d: usize,
k: usize,
@@ -100,11 +115,7 @@ pub fn crs_gen<G: Curve>(
rng: &mut dyn RngCore,
) -> PublicParams<G> {
let alpha = G::Zp::rand(rng);
let b_r = d as u64 / 2 + 1;
let big_d =
d + k * t.ilog2() as usize + (d + k) * (2 + b.ilog2() as usize + b_r.ilog2() as usize);
let n = big_d + 1;
let (n, big_d, b_r) = compute_crs_params(d, k, b, q, t);
PublicParams {
g_lists: GroupElements::<G>::new(n, alpha),
big_d,

View File

@@ -11,6 +11,7 @@ use tfhe::keycache::NamedParam;
use tfhe::shortint::parameters::{
PARAM_GPU_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
};
#[cfg(not(feature = "gpu"))]
use tfhe::shortint::parameters::{
@@ -42,9 +43,10 @@ const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 6] = [
];
#[cfg(feature = "gpu")]
const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 2] = [
const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 3] = [
PARAM_GPU_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
];
const BOOLEAN_BENCH_PARAMS: [(&str, BooleanParameters); 2] = [

View File

@@ -39,9 +39,9 @@ impl Default for ParamsAndNumBlocksIter {
if env_config.is_multi_bit {
#[cfg(feature = "gpu")]
let params = vec![PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let params = vec![PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
#[cfg(not(feature = "gpu"))]
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
@@ -1538,6 +1538,11 @@ mod cuda {
display_name: mul
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_div_rem,
display_name: div_rem
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_add,
display_name: add
@@ -1720,6 +1725,11 @@ mod cuda {
display_name: mul
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: div_rem,
display_name: div_rem
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: ne,
display_name: not_equal
@@ -1898,6 +1908,7 @@ mod cuda {
cuda_unchecked_bitor,
cuda_unchecked_bitxor,
cuda_unchecked_mul,
cuda_unchecked_div_rem,
cuda_unchecked_sub,
cuda_unchecked_unsigned_overflowing_sub,
cuda_unchecked_add,
@@ -1938,6 +1949,7 @@ mod cuda {
cuda_unsigned_overflowing_sub,
cuda_add,
cuda_mul,
cuda_div_rem,
cuda_eq,
cuda_ne,
cuda_ge,
@@ -1982,7 +1994,7 @@ mod cuda {
display_name: &str,
cast_op: F,
) where
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize),
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -2021,7 +2033,7 @@ mod cuda {
b.iter_batched(
encrypt_one_value,
|ct| {
cast_op(&gpu_sks, ct, target_num_blocks);
cast_op(&gpu_sks, ct, target_num_blocks, &stream);
},
criterion::BatchSize::SmallInput,
)
@@ -2046,12 +2058,12 @@ mod cuda {
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_server_key_cast_function(
cuda_bench_server_key_cast_function(
c,
concat!("integer::cuda::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
server_key.$server_key_method(lhs, rhs);
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
})
}
}
@@ -2406,7 +2418,7 @@ fn go_through_gpu_bench_groups(val: &str) {
"default" => {
default_cuda_ops();
default_scalar_cuda_ops();
cuda_cast_ops()
cuda_cast_ops();
}
"unchecked" => {
unchecked_cuda_ops();

View File

@@ -826,7 +826,7 @@ macro_rules! define_server_key_bench_binary_scalar_clean_inputs_fn (
fn $server_key_method(c: &mut Criterion) {
bench_server_key_binary_scalar_function_clean_inputs(
c,
concat!("integer::", stringify!($server_key_method)),
concat!("integer::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
server_key.$server_key_method(lhs, rhs);
@@ -2147,11 +2147,106 @@ mod cuda {
cuda_scalar_min,
cuda_scalar_max,
);
fn cuda_bench_server_key_signed_cast_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
cast_op: F,
) where
F: Fn(&CudaServerKey, CudaSignedRadixCiphertext, usize, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();
let env_config = EnvConfig::new();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_blocks, bit_size) in ParamsAndNumBlocksIter::default() {
let all_num_blocks = env_config
.bit_sizes()
.iter()
.copied()
.map(|bit| bit.div_ceil(param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();
for target_num_blocks in all_num_blocks.iter().copied() {
let target_bit_size =
target_num_blocks * param.message_modulus().0.ilog2() as usize;
let bench_id =
format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_one_value = || -> CudaSignedRadixCiphertext {
let ct = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_blocks);
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &stream)
};
b.iter_batched(
encrypt_one_value,
|ct| {
cast_op(&gpu_sks, ct, target_num_blocks, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_blocks],
);
}
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_signed_cast_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
cuda_bench_server_key_signed_cast_function(
c,
concat!("integer::cuda::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
})
}
}
}
);
define_cuda_server_key_bench_signed_cast_fn!(
method_name: cast_to_unsigned,
display_name: cast_to_unsigned
);
define_cuda_server_key_bench_signed_cast_fn!(
method_name: cast_to_signed,
display_name: cast_to_signed
);
criterion_group!(cuda_cast_ops, cuda_cast_to_unsigned, cuda_cast_to_signed);
}
#[cfg(feature = "gpu")]
use cuda::{
default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops,
cuda_cast_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops,
unchecked_scalar_cuda_ops,
};
#[cfg(feature = "gpu")]
@@ -2160,6 +2255,7 @@ fn go_through_gpu_bench_groups(val: &str) {
"default" => {
default_cuda_ops();
default_scalar_cuda_ops();
cuda_cast_ops();
}
"unchecked" => {
unchecked_cuda_ops();

View File

@@ -1,8 +1,8 @@
// If this test breaks, the c_api doc needs to be updated
#include "tfhe.h"
#include <assert.h>
#include <stdio.h>
#include <tfhe.h>
int main(void) {
int ok = 0;

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,5 +1,5 @@
#if defined(WITH_FEATURE_GPU)
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -24,7 +24,7 @@
* - NUM_CPU: number of CPU threads (tfhe automatically creates them internally)
*/
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <pthread.h>

View File

@@ -1,7 +1,6 @@
#include "tfhe.h"
#include <assert.h>
#include <stdlib.h>
#include <tfhe.h>
int main(void) {
// We want to use zk-proof, which requires bounded random distributions

View File

@@ -4,7 +4,7 @@ The first step is the creation of the configuration. The configuration is used t
A configuration can be created by using the ConfigBuilder type.
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](broken-reference).
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](../guides/rust\_configuration.md#homomorphic-types).
The config is generated by first creating a builder with all types deactivated. Then, the integer types with default parameters are activated, since we are going to use FheUint8 values.
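A minimal sketch of this configuration step, assuming the `ConfigBuilder::default()` shorthand (the same move the JS example later in this diff makes), which activates the default integer parameters:

```rust
use tfhe::{generate_keys, ConfigBuilder};

fn main() {
    // Default builder: integer types such as FheUint8 come enabled with
    // their default parameters, as described above.
    let config = ConfigBuilder::default().build();
    // The config is then consumed by key generation.
    let (_client_key, _server_key) = generate_keys(config);
}
```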

View File

@@ -3,7 +3,7 @@
The basic steps for using the high-level API of TFHE-rs are:
1. [Importing the TFHE-rs prelude](quick\_start.md#imports);
2. Client-side: [Configuring and creating keys](../fundamentals/configure-and-create-keys.md);
2. Client-side: [Configuring and creating keys](../fundamentals/configure-and-generate-keys.md);
3. Client-side: [Encrypting data](../fundamentals/encrypt-data.md);
4. Server-side: [Setting the server key](../fundamentals/set-the-server-key.md);
5. Server-side: [Computing over encrypted data](../fundamentals/compute.md);
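Taken together, the steps above map onto a short program. A minimal sketch, assuming the quick-start high-level API (`generate_keys`, `set_server_key`, `FheUint8`):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    // Steps 1-2, client-side: configure and create keys.
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);

    // Step 3, client-side: encrypt data.
    let a = FheUint8::encrypt(27u8, &client_key);
    let b = FheUint8::encrypt(15u8, &client_key);

    // Steps 4-5, server-side: set the server key, then compute.
    set_server_key(server_key);
    let c = &a + &b;

    // Back on the client: decrypt and check.
    let dec: u8 = c.decrypt(&client_key);
    assert_eq!(dec, 42);
}
```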

View File

@@ -79,7 +79,7 @@ $
```c
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <stdio.h>

View File

@@ -35,8 +35,7 @@ function fhe_uint32_example() {
init_panic_hook();
const block_params = new ShortintParameters(ShortintParametersName.PARAM_SMALL_MESSAGE_2_CARRY_2_COMPACT_PK);
let config = TfheConfigBuilder.all_disabled()
.enable_default_integers()
let config = TfheConfigBuilder.default()
.build();
let clientKey = TfheClientKey.generate(config);

View File

@@ -201,7 +201,7 @@ All operations follow the same syntax as the one described [here](../gettin
# Benchmarks
All GPU benchmarks presented here were obtained on a single H100 GPU, and rely on the multithreaded PBS algorithm.
The cryptographic parameters PARAM\_GPU\_MULTI\_BIT\_MESSAGE\_1\_CARRY\_2\_GROUP\_3\_KS\_PBS were used.
The cryptographic parameters PARAM\_GPU\_MULTI\_BIT\_MESSAGE\_2\_CARRY\_2\_GROUP\_3\_KS\_PBS were used.
Performance is the following when the inputs of the benchmarked operation are encrypted:
| Operation \ Size | `FheUint7` | `FheUint16` | `FheUint32` | `FheUint64` | `FheUint128` | `FheUint256` |

View File

@@ -170,10 +170,10 @@ pub unsafe extern "C" fn core_crypto_lwe_encrypt(
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let seed = Seed(seed);
let mut determinisitic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
determinisitic_seeder.seed(),
&mut determinisitic_seeder,
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let plaintext = Plaintext(pt);
@@ -219,10 +219,10 @@ pub unsafe extern "C" fn core_crypto_ggsw_encrypt(
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let seed = Seed(seed);
let mut determinisitic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
determinisitic_seeder.seed(),
&mut determinisitic_seeder,
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let plaintext = Plaintext(pt);

View File

@@ -3246,7 +3246,12 @@ mod test {
let mut thread_rng = rand::thread_rng();
for _ in 0..10_000 {
#[cfg(not(tarpaulin))]
let nb_tests = 10_000;
#[cfg(tarpaulin)]
let nb_tests = 1;
for _ in 0..nb_tests {
let lwe_sk =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
@@ -3305,7 +3310,12 @@ mod test {
let mut thread_rng = rand::thread_rng();
for _ in 0..100 {
#[cfg(not(tarpaulin))]
let nb_tests = 100;
#[cfg(tarpaulin)]
let nb_tests = 1;
for _ in 0..nb_tests {
// We'll encrypt between 1 and 4 * lwe_dimension ciphertexts
let ct_count: usize = thread_rng.gen();
let ct_count = ct_count % (lwe_dimension.0 * 4) + 1;
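Both hunks above apply the same coverage trick: cargo-tarpaulin compiles with `--cfg tarpaulin`, so iteration counts can be collapsed under coverage, where extra iterations exercise the same lines without adding coverage. A minimal sketch of the pattern (constant name and counts illustrative):

```rust
// Full iteration count for regular `cargo test` runs...
#[cfg(not(tarpaulin))]
const NB_TESTS: usize = 10_000;
// ...and a single iteration under tarpaulin coverage measurement.
#[cfg(tarpaulin)]
const NB_TESTS: usize = 1;

#[test]
fn property_holds() {
    for _ in 0..NB_TESTS {
        assert_eq!(1 + 1, 2); // stand-in for the real property check
    }
}
```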

View File

@@ -206,7 +206,10 @@ where
mod test {
use super::*;
#[cfg(not(tarpaulin))]
const NB_TESTS: usize = 1_000_000_000;
#[cfg(tarpaulin)]
const NB_TESTS: usize = 1;
#[test]
fn test_divide_funcs() {
@@ -295,7 +298,7 @@ mod test {
use rand::Rng;
let mut rng = rand::thread_rng();
for _ in 0..1_000_000_000 {
for _ in 0..NB_TESTS {
let value: u64 = rng.gen();
// This is an easy case where we expect the values to match exactly, to cover other
// cases we would be re coding the algorithms here.

View File

@@ -993,7 +993,10 @@ fn lwe_compact_public_encrypt_decrypt_custom_mod<Scalar: UnsignedTorus>(
}
create_parametrized_test!(lwe_compact_public_encrypt_decrypt_custom_mod {
TEST_PARAMS_4_BITS_NATIVE_U64
#[cfg(not(tarpaulin))]
TEST_PARAMS_4_BITS_NATIVE_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64
});
#[cfg(feature = "zk-pok-experimental")]
@@ -1095,7 +1098,10 @@ fn lwe_compact_public_encrypt_prove_verify_decrypt_custom_mod<Scalar>(
#[cfg(feature = "zk-pok-experimental")]
create_parametrized_test!(lwe_compact_public_encrypt_prove_verify_decrypt_custom_mod {
TEST_PARAMS_4_BITS_NATIVE_U64
#[cfg(not(tarpaulin))]
TEST_PARAMS_4_BITS_NATIVE_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64
});
#[cfg(feature = "zk-pok-experimental")]
@@ -1316,3 +1322,219 @@ fn test_par_compact_lwe_list_public_key_encryption_and_proof() {
assert_eq!(ser_lwe_ct_list, par_lwe_ct_list);
}
}
#[test]
fn test_compact_public_key_encryption() {
use rand::Rng;
#[cfg(not(tarpaulin))]
let nb_tests = 10_000;
#[cfg(tarpaulin)]
let nb_tests = 100;
#[cfg(not(tarpaulin))]
let lwe_dimension = LweDimension(2048);
#[cfg(tarpaulin)]
let lwe_dimension = LweDimension(1);
let glwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
let ciphertext_modulus = CiphertextModulus::new_native();
let mut secret_random_generator = test_tools::new_secret_random_generator();
let mut encryption_random_generator = test_tools::new_encryption_random_generator();
let mut thread_rng = rand::thread_rng();
for _ in 0..nb_tests {
let lwe_sk = LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
let mut compact_lwe_pk = LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
generate_lwe_compact_public_key(
&lwe_sk,
&mut compact_lwe_pk,
glwe_noise_distribution,
&mut encryption_random_generator,
);
let msg: u64 = thread_rng.gen();
let msg = msg % 16;
let plaintext = Plaintext(msg << 60);
let mut output_ct = LweCiphertext::new(
0u64,
lwe_dimension.to_lwe_size(),
CiphertextModulus::new_native(),
);
encrypt_lwe_ciphertext_with_compact_public_key(
&compact_lwe_pk,
&mut output_ct,
plaintext,
glwe_noise_distribution,
glwe_noise_distribution,
&mut secret_random_generator,
&mut encryption_random_generator,
);
let decrypted_plaintext = decrypt_lwe_ciphertext(&lwe_sk, &output_ct);
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
let cleartext = signed_decomposer.closest_representable(decrypted_plaintext.0) >> 60;
assert_eq!(cleartext, msg);
}
}
#[test]
fn test_par_compact_lwe_list_public_key_encryption_equivalence() {
use rand::Rng;
let lwe_dimension = LweDimension(2048);
let glwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
let ciphertext_modulus = CiphertextModulus::new_native();
let mut thread_rng = rand::thread_rng();
for _ in 0..NB_TESTS {
// We'll encrypt between 1 and 4 * lwe_dimension ciphertexts
let ct_count: usize = thread_rng.gen();
let ct_count = ct_count % (lwe_dimension.0 * 4) + 1;
let lwe_ciphertext_count = LweCiphertextCount(ct_count);
let seed = test_tools::random_seed();
let mut input_plaintext_list =
PlaintextList::new(0u64, PlaintextCount(lwe_ciphertext_count.0));
input_plaintext_list.iter_mut().for_each(|x| {
let msg: u64 = thread_rng.gen();
*x.0 = (msg % 16) << 60;
});
let par_lwe_ct_list = {
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut secret_random_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(deterministic_seeder.seed());
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let lwe_sk =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
let mut compact_lwe_pk =
LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
generate_lwe_compact_public_key(
&lwe_sk,
&mut compact_lwe_pk,
glwe_noise_distribution,
&mut encryption_random_generator,
);
let mut output_compact_ct_list = LweCompactCiphertextList::new(
0u64,
lwe_dimension.to_lwe_size(),
lwe_ciphertext_count,
ciphertext_modulus,
);
par_encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
&compact_lwe_pk,
&mut output_compact_ct_list,
&input_plaintext_list,
glwe_noise_distribution,
glwe_noise_distribution,
&mut secret_random_generator,
&mut encryption_random_generator,
);
let mut output_plaintext_list = input_plaintext_list.clone();
output_plaintext_list.as_mut().fill(0u64);
let lwe_ciphertext_list = output_compact_ct_list.par_expand_into_lwe_ciphertext_list();
decrypt_lwe_ciphertext_list(&lwe_sk, &lwe_ciphertext_list, &mut output_plaintext_list);
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
output_plaintext_list
.iter_mut()
.for_each(|x| *x.0 = signed_decomposer.closest_representable(*x.0));
assert_eq!(input_plaintext_list, output_plaintext_list);
lwe_ciphertext_list
};
let ser_lwe_ct_list = {
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut secret_random_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(deterministic_seeder.seed());
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let lwe_sk =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
let mut compact_lwe_pk =
LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
generate_lwe_compact_public_key(
&lwe_sk,
&mut compact_lwe_pk,
glwe_noise_distribution,
&mut encryption_random_generator,
);
let mut output_compact_ct_list = LweCompactCiphertextList::new(
0u64,
lwe_dimension.to_lwe_size(),
lwe_ciphertext_count,
ciphertext_modulus,
);
encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
&compact_lwe_pk,
&mut output_compact_ct_list,
&input_plaintext_list,
glwe_noise_distribution,
glwe_noise_distribution,
&mut secret_random_generator,
&mut encryption_random_generator,
);
let mut output_plaintext_list = input_plaintext_list.clone();
output_plaintext_list.as_mut().fill(0u64);
let lwe_ciphertext_list = output_compact_ct_list.expand_into_lwe_ciphertext_list();
decrypt_lwe_ciphertext_list(&lwe_sk, &lwe_ciphertext_list, &mut output_plaintext_list);
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
output_plaintext_list
.iter_mut()
.for_each(|x| *x.0 = signed_decomposer.closest_representable(*x.0));
assert_eq!(input_plaintext_list, output_plaintext_list);
lwe_ciphertext_list
};
assert_eq!(ser_lwe_ct_list, par_lwe_ct_list);
}
}

View File

@@ -212,7 +212,7 @@ fn test_lwe_encrypt_ks_switch_mod_decrypt_custom_mod() {
// In coverage, we break after one while-loop iteration; changing message values does not
// yield higher coverage
#[cfg(feature = "__coverage")]
#[cfg(tarpaulin)]
break;
}
}

View File

@@ -12,10 +12,18 @@ fn test_seeded_lwe_ksk_gen_equivalence<Scalar: UnsignedTorus + Send + Sync>(
// DISCLAIMER: these toy example parameters are not guaranteed to be secure or yield correct
// computations
// Define parameters for LweKeyswitchKey creation
#[cfg(not(tarpaulin))]
let input_lwe_dimension = LweDimension(742);
#[cfg(tarpaulin)]
let input_lwe_dimension = LweDimension(1);
#[cfg(not(tarpaulin))]
let output_lwe_dimension = LweDimension(2048);
#[cfg(tarpaulin)]
let output_lwe_dimension = LweDimension(32);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.000007069849454709433));
let output_lwe_dimension = LweDimension(2048);
let decomp_base_log = DecompositionBaseLog(3);
let decomp_level_count = DecompositionLevelCount(5);

View File

@@ -535,11 +535,13 @@ fn std_lwe_encrypt_multi_bit_deterministic_pbs_decrypt_custom_mod<Scalar>(
#[test]
pub fn test_lwe_encrypt_multi_bit_pbs_decrypt_factor_2_thread_5_native_mod() {
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_2_2_2_PARAMS);
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_3_3_2_PARAMS);
}
#[test]
pub fn test_lwe_encrypt_multi_bit_pbs_decrypt_factor_3_thread_12_native_mod() {
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_2_2_3_PARAMS);
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_3_3_3_PARAMS);
}
#[test]

View File

@@ -12,9 +12,17 @@ fn test_seeded_lwe_pksk_gen_equivalence<Scalar: UnsignedTorus>(
// DISCLAIMER: these toy example parameters are not guaranteed to be secure or yield correct
// computations
// Define parameters for LweKeyswitchKey creation
#[cfg(not(tarpaulin))]
let input_lwe_dimension = LweDimension(742);
let output_glwe_dimension = GlweDimension(1);
#[cfg(tarpaulin)]
let input_lwe_dimension = LweDimension(1);
#[cfg(not(tarpaulin))]
let output_polynomial_size = PolynomialSize(2048);
#[cfg(tarpaulin)]
let output_polynomial_size = PolynomialSize(32);
let output_glwe_dimension = GlweDimension(1);
let glwe_noise_distribution = DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
));

View File

@@ -398,6 +398,32 @@ pub const TEST_PARAMS_4_BITS_NATIVE_U128: ClassicTestParams<u128> = ClassicTestP
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U128: ClassicTestParams<u128> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
4.9982771e-11,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
8.6457178e-32,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestParams {
lwe_dimension: LweDimension(742),
glwe_dimension: GlweDimension(1),
@@ -423,6 +449,32 @@ pub const TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestPara
ciphertext_modulus: CiphertextModulus::new(1 << 127),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
4.9982771e-11,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
8.6457178e-32,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(3),
ciphertext_modulus: CiphertextModulus::new(1 << 127),
};
fn lwe_encrypt_pbs_f128_decrypt_custom_mod<Scalar>(params: ClassicTestParams<Scalar>)
where
Scalar: UnsignedTorus
@@ -537,9 +589,15 @@ where
#[test]
fn lwe_encrypt_pbs_f128_decrypt_custom_mod_test_params_4_bits_native_u128() {
#[cfg(not(tarpaulin))]
lwe_encrypt_pbs_f128_decrypt_custom_mod(TEST_PARAMS_4_BITS_NATIVE_U128);
#[cfg(tarpaulin)]
lwe_encrypt_pbs_f128_decrypt_custom_mod(COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U128);
}
#[test]
fn lwe_encrypt_pbs_f128_decrypt_custom_mod_test_params_3_bits_127_u128() {
#[cfg(not(tarpaulin))]
lwe_encrypt_pbs_f128_decrypt_custom_mod(TEST_PARAMS_3_BITS_127_U128);
#[cfg(tarpaulin)]
lwe_encrypt_pbs_f128_decrypt_custom_mod(COVERAGE_TEST_PARAMS_3_BITS_127_U128);
}

View File

@@ -24,6 +24,7 @@ mod lwe_packing_keyswitch_key_generation;
mod lwe_private_functional_packing_keyswitch;
pub(crate) mod lwe_programmable_bootstrapping;
mod modulus_switch_compression;
#[cfg(not(tarpaulin))]
mod noise_distribution;
pub struct TestResources {
@@ -78,6 +79,32 @@ pub const TEST_PARAMS_4_BITS_NATIVE_U64: ClassicTestParams<u64> = ClassicTestPar
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(32),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.000007069849454709433,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(742),
glwe_dimension: GlweDimension(1),
@@ -103,6 +130,32 @@ pub const TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams
ciphertext_modulus: CiphertextModulus::new(1 << 63),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(32),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.000007069849454709433,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(3),
ciphertext_modulus: CiphertextModulus::new(1 << 63),
};
pub const TEST_PARAMS_3_BITS_SOLINAS_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(742),
glwe_dimension: GlweDimension(1),
@@ -196,6 +249,24 @@ pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_3_3_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(922),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.0000003272369292345697,
)),
decomp_base_log: DecompositionBaseLog(14),
decomp_level_count: DecompositionLevelCount(2),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(8192),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.0000000000000000002168404344971009,
)),
message_modulus_log: MessageModulusLog(6),
ciphertext_modulus: CiphertextModulus::new_native(),
grouping_factor: LweBskGroupingFactor(2),
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_2_2_2_CUSTOM_MOD_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(818),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
@@ -232,6 +303,24 @@ pub const MULTI_BIT_2_2_3_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
thread_count: ThreadCount(12),
};
pub const MULTI_BIT_3_3_3_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(972),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000013016688349592805,
)),
decomp_base_log: DecompositionBaseLog(14),
decomp_level_count: DecompositionLevelCount(2),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(8192),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.0000000000000000002168404344971009,
)),
message_modulus_log: MessageModulusLog(6),
ciphertext_modulus: CiphertextModulus::new_native(),
grouping_factor: LweBskGroupingFactor(3),
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_2_2_3_CUSTOM_MOD_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(888),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
@@ -311,6 +400,70 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
// DISCLAIMER: example parameters tailored for coverage tests. They are not guaranteed
// to be secure or to yield correct computations.
#[cfg(tarpaulin)]
pub const COVERAGE_FFT_U32_PARAMS: FftTestParams<u32> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_FFT_U64_PARAMS: FftTestParams<u64> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_FFT_U128_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.12345)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
lwe_dimension: LweDimension(481),
glwe_dimension: GlweDimension(1),
@@ -480,21 +633,33 @@ pub(crate) fn gen_keys_or_get_from_cache_if_enabled<
// Macro to generate tests for all parameter sets
macro_rules! create_parametrized_test{
($name:ident { $($param:ident),* $(,)? }) => {
(
$name:ident {
$($(#[$cfg:meta])* $param:ident),*
$(,)?
}
) => {
::paste::paste! {
$(
#[test]
fn [<test_ $name _ $param:lower>]() {
$name($param)
}
#[test]
$(#[$cfg])*
fn [<test_ $name _ $param:lower>]() {
$name($param)
}
)*
}
};
($name:ident)=> {
create_parametrized_test!($name
{
#[cfg(not(tarpaulin))]
TEST_PARAMS_4_BITS_NATIVE_U64,
TEST_PARAMS_3_BITS_63_U64
#[cfg(not(tarpaulin))]
TEST_PARAMS_3_BITS_63_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_3_BITS_63_U64
});
};
}
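For reference, a hand-expanded sketch of one arm of the macro above, with hypothetical test and parameter names:

```rust
// `create_parametrized_test!(my_check { #[cfg(not(tarpaulin))] MY_PARAMS })`
// expands via `paste!` to roughly:
#[test]
#[cfg(not(tarpaulin))]
fn test_my_check_my_params() {
    my_check(MY_PARAMS)
}
```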

View File

@@ -9,7 +9,6 @@ pub struct ClassicBootstrapKeys<Scalar: UnsignedInteger> {
pub small_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub big_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub bsk: LweBootstrapKeyOwned<Scalar>,
pub fbsk: FourierLweBootstrapKeyOwned,
}

View File

@@ -2,6 +2,7 @@
pub(crate) mod mask_random_generator;
pub(crate) mod noise_random_generator;
#[cfg(not(tarpaulin))]
#[cfg(test)]
mod test;

Some files were not shown because too many files have changed in this diff.